Fixed problem with unset variable (compose_char) in yaz_iconv system
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2006, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.29 2006-08-31 18:19:53 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  *
16  * MARC-8 reference:
17  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
18  */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <errno.h>
26 #include <string.h>
27 #include <ctype.h>
28 #if HAVE_WCHAR_H
29 #include <wchar.h>
30 #endif
31
32 #if HAVE_ICONV_H
33 #include <iconv.h>
34 #endif
35
36
37 #include <yaz/yaz-util.h>
38
/*
 * MARC-8 table converters, defined outside this file.
 *
 * yaz_marc8_N_conv: called from yaz_read_marc8_comb() with raw MARC-8
 * input; the table number N is chosen from the current escape mode.
 * Each call reports the number of input bytes used in *no_read and sets
 * *combining for combining characters. Per the note in yaz_read_marc8(),
 * these handlers are generated by charconv.tcl and return 0 with
 * no_read=1 when the input does not match any sequence.
 */
unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);
unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);


/*
 * yaz_marc8r_N_conv: called from lookup_marc8() with the UTF-8 encoding
 * of a single character; a non-zero result is taken as the MARC-8 code
 * for page N (presumably the reverse of the tables above -- confirm
 * against the generated sources).
 */
unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
77
/*
 * Conversion descriptor state shared by the built-in read/write handlers
 * in this file.
 */
struct yaz_iconv_struct {
    /* last error recorded by a handler (one of the YAZ_ICONV_* codes) */
    int my_errno;
    /* NOTE(review): not referenced in this part of the file; presumably
       tracks whether init_handle has run -- confirm */
    int init_flag;
    /* optional start-of-input hook (same signature as yaz_init_UTF8) */
    size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
                          size_t inbytesleft, size_t *no_read);
    /* decodes one character from the input; bytes used go in *no_read */
    unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
                                 size_t inbytesleft, size_t *no_read);
    /* encodes one character to the output; last is passed through to
       the yaz_write_* handlers */
    size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
                           char **outbuf, size_t *outbytesleft,
                           int last);
    /* final byte of the most recent MARC-8 escape sequence; selects the
       conversion table in yaz_read_marc8_comb() */
    int marc8_esc_mode;

    /* queue of combining characters collected by yaz_read_marc8():
       comb_offset is the next entry to hand out, comb_size the number
       of entries stored */
    int comb_offset;
    int comb_size;
    unsigned long comb_x[8];      /* queued combining characters */
    size_t comb_no_read[8];       /* input bytes used by each entry */
    /* NOTE(review): no_read_x and unget_x are not referenced in this
       part of the file -- confirm their use before relying on them */
    size_t no_read_x;
    unsigned long unget_x;
#if HAVE_ICONV_H
    /* NOTE(review): presumably the system iconv descriptor used when
       the conversion is not built in -- confirm */
    iconv_t iconv_cd;
#endif
    /* character held back by yaz_write_ISO8859_1() so that it can be
       merged with a following combining character; 0 when none */
    unsigned long compose_char;

    /* MARC-8 writer state (yaz_write_marc8_2 / flush_combos): */
    unsigned long write_marc8_comb_ch[8]; /* pending combining chars */
    size_t write_marc8_comb_no;           /* number of pending ones */
    unsigned long write_marc8_last;       /* pending base char, 0=none */
    const char *write_marc8_page_chr;     /* escape seq of current page */
};
106
/*
 * Pairs of (base character x1, Unicode combining mark x2) and the single
 * ISO-8859-1 code point y they compose to.  The list is terminated by an
 * entry whose x1 is 0.  The table is only ever read, so it is const.
 */
static const struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
    { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
    { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
    { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
    { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
    /* omitted:    0xc6      LATIN CAPITAL LETTER AE */
    { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
    { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
    { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
    { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
    { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
    { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
    { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
    { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
    { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
    { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
    { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
    { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
    { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
    { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
    { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
    /* omitted:    0xd7      MULTIPLICATION SIGN */
    /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
    { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
    { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
    { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
    { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
    { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
    /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
    /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
    { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
    { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
    { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
    { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
    { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
    { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
    /* omitted:    0xe6      LATIN SMALL LETTER AE */
    { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
    { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
    { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
    { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
    { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
    { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
    { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
    { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
    { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
    /* omitted:    0xf0      LATIN SMALL LETTER ETH */
    { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
    { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
    { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
    { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
    { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
    { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
    /* omitted:    0xf7      DIVISION SIGN */
    /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
    { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
    { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
    { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
    { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
    { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
    /* omitted:    0xfe      LATIN SMALL LETTER THORN */
    { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */

    { 0, 0, 0}
};
177
178 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
179                                          size_t inbytesleft, size_t *no_read)
180 {
181     unsigned long x = inp[0];
182     *no_read = 1;
183     return x;
184 }
185
186
187 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
188                              size_t inbytesleft, size_t *no_read)
189 {
190     if (inp[0] != 0xef)
191     {
192         *no_read = 0;
193         return 0;
194     }
195     if (inbytesleft < 3)
196     {
197         cd->my_errno = YAZ_ICONV_EINVAL;
198         return (size_t) -1;
199     }
200     if (inp[1] != 0xbb && inp[2] == 0xbf)
201         *no_read = 3;
202     else
203         *no_read = 0;
204     return 0;
205 }
206
207 unsigned long yaz_read_UTF8_char(unsigned char *inp,
208                                  size_t inbytesleft, size_t *no_read,
209                                  int *error)
210 {
211     unsigned long x = 0;
212
213     if (inp[0] <= 0x7f)
214     {
215         x = inp[0];
216         *no_read = 1;
217     }
218     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
219     {
220         *no_read = 0;
221         *error = YAZ_ICONV_EILSEQ;
222     }
223     else if (inp[0] <= 0xdf && inbytesleft >= 2)
224     {
225         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
226         if (x >= 0x80)
227             *no_read = 2;
228         else
229         {
230             *no_read = 0;
231             *error = YAZ_ICONV_EILSEQ;
232         }
233     }
234     else if (inp[0] <= 0xef && inbytesleft >= 3)
235     {
236         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
237             (inp[2] & 0x3f);
238         if (x >= 0x800)
239             *no_read = 3;
240         else
241         {
242             *no_read = 0;
243             *error = YAZ_ICONV_EILSEQ;
244         }
245     }
246     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
247     {
248         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
249             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
250         if (x >= 0x10000)
251             *no_read = 4;
252         else
253         {
254             *no_read = 0;
255             *error = YAZ_ICONV_EILSEQ;
256         }
257     }
258     else if (inp[0] <= 0xfb && inbytesleft >= 5)
259     {
260         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
261             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
262             (inp[4] & 0x3f);
263         if (x >= 0x200000)
264             *no_read = 5;
265         else
266         {
267             *no_read = 0;
268             *error = YAZ_ICONV_EILSEQ;
269         }
270     }
271     else if (inp[0] <= 0xfd && inbytesleft >= 6)
272     {
273         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
274             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
275             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
276         if (x >= 0x4000000)
277             *no_read = 6;
278         else
279         {
280             *no_read = 0;
281             *error = YAZ_ICONV_EILSEQ;
282         }
283     }
284     else
285     {
286         *no_read = 0;
287         *error = YAZ_ICONV_EINVAL;
288     }
289     return x;
290 }
291
292 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
293                                     size_t inbytesleft, size_t *no_read)
294 {
295     return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
296 }
297
298 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
299                                     size_t inbytesleft, size_t *no_read)
300 {
301     unsigned long x = 0;
302     
303     if (inbytesleft < 4)
304     {
305         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
306         *no_read = 0;
307     }
308     else
309     {
310         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
311         *no_read = 4;
312     }
313     return x;
314 }
315
316 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
317                                       size_t inbytesleft, size_t *no_read)
318 {
319     unsigned long x = 0;
320     
321     if (inbytesleft < 4)
322     {
323         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
324         *no_read = 0;
325     }
326     else
327     {
328         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
329         *no_read = 4;
330     }
331     return x;
332 }
333
334 #if HAVE_WCHAR_H
335 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
336                                        size_t inbytesleft, size_t *no_read)
337 {
338     unsigned long x = 0;
339     
340     if (inbytesleft < sizeof(wchar_t))
341     {
342         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
343         *no_read = 0;
344     }
345     else
346     {
347         wchar_t wch;
348         memcpy (&wch, inp, sizeof(wch));
349         x = wch;
350         *no_read = sizeof(wch);
351     }
352     return x;
353 }
354 #endif
355
356
/* Forward declaration: yaz_read_marc8() below calls this before its
   definition.  *comb is set non-zero for combining characters. */
static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
                                          size_t inbytesleft, size_t *no_read,
                                          int *comb);
360
361 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
362                                      size_t inbytesleft, size_t *no_read)
363 {
364     unsigned long x;
365     if (cd->comb_offset < cd->comb_size)
366     {
367         *no_read = cd->comb_no_read[cd->comb_offset];
368         x = cd->comb_x[cd->comb_offset];
369
370         /* special case for double-diacritic combining characters, 
371            INVERTED BREVE and DOUBLE TILDE.
372            We'll increment the no_read counter by 1, since we want to skip over
373            the processing of the closing ligature character
374         */
375         /* this code is no longer necessary.. our handlers code in
376            yaz_marc8_?_conv (generated by charconv.tcl) now returns
377            0 and no_read=1 when a sequence does not match the input.
378            The SECOND HALFs in codetables.xml produces a non-existant
379            entry in the conversion trie.. Hence when met, the input byte is
380            skipped as it should (in yaz_iconv)
381         */
382 #if 0
383         if (x == 0x0361 || x == 0x0360)
384             *no_read += 1;
385 #endif
386         cd->comb_offset++;
387         return x;
388     }
389
390     cd->comb_offset = 0;
391     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
392     {
393         int comb = 0;
394         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
395         if (!comb || !x)
396             break;
397         cd->comb_x[cd->comb_size] = x;
398         cd->comb_no_read[cd->comb_size] = *no_read;
399         inp += *no_read;
400         inbytesleft = inbytesleft - *no_read;
401     }
402     return x;
403 }
404
405 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
406                                      size_t inbytesleft, size_t *no_read)
407 {
408     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
409     if (x && cd->comb_size == 1)
410     {
411         /* For MARC8s we try to get a Latin-1 page code out of it */
412         int i;
413         for (i = 0; latin1_comb[i].x1; i++)
414             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
415             {
416                 *no_read += cd->comb_no_read[0];
417                 cd->comb_size = 0;
418                 x = latin1_comb[i].y;
419                 break;
420             }
421     }
422     return x;
423 }
424
425 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
426                                          size_t inbytesleft, size_t *no_read,
427                                          int *comb)
428 {
429     *no_read = 0;
430     while(inbytesleft >= 1 && inp[0] == 27)
431     {
432         size_t inbytesleft0 = inbytesleft;
433         inp++;
434         inbytesleft--;
435         while(inbytesleft > 0 && strchr("(,$!", *inp))
436         {
437             inbytesleft--;
438             inp++;
439         }
440         if (inbytesleft <= 0)
441         {
442             *no_read = 0;
443             cd->my_errno = YAZ_ICONV_EINVAL;
444             return 0;
445         }
446         cd->marc8_esc_mode = *inp++;
447         inbytesleft--;
448         (*no_read) += inbytesleft0 - inbytesleft;
449     }
450     if (inbytesleft <= 0)
451         return 0;
452     else
453     {
454         unsigned long x;
455         size_t no_read_sub = 0;
456         *comb = 0;
457
458         switch(cd->marc8_esc_mode)
459         {
460         case 'B':  /* Basic ASCII */
461         case 'E':  /* ANSEL */
462         case 's':  /* ASCII */
463             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
464             break;
465         case 'g':  /* Greek */
466             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
467             break;
468         case 'b':  /* Subscripts */
469             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
470             break;
471         case 'p':  /* Superscripts */
472             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
473             break;
474         case '2':  /* Basic Hebrew */
475             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
476             break;
477         case 'N':  /* Basic Cyrillic */
478         case 'Q':  /* Extended Cyrillic */
479             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
480             break;
481         case '3':  /* Basic Arabic */
482         case '4':  /* Extended Arabic */
483             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
484             break;
485         case 'S':  /* Greek */
486             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
487             break;
488         case '1':  /* Chinese, Japanese, Korean (EACC) */
489             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
490             break;
491         default:
492             *no_read = 0;
493             cd->my_errno = YAZ_ICONV_EILSEQ;
494             return 0;
495         }
496         *no_read += no_read_sub;
497         return x;
498     }
499 }
500
501 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
502                              char **outbuf, size_t *outbytesleft,
503                              int last)
504 {
505     return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
506 }
507
508 size_t yaz_write_UTF8_char(unsigned long x,
509                            char **outbuf, size_t *outbytesleft,
510                            int *error)
511 {
512     unsigned char *outp = (unsigned char *) *outbuf;
513
514     if (x <= 0x7f && *outbytesleft >= 1)
515     {
516         *outp++ = (unsigned char) x;
517         (*outbytesleft)--;
518     } 
519     else if (x <= 0x7ff && *outbytesleft >= 2)
520     {
521         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
522         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
523         (*outbytesleft) -= 2;
524     }
525     else if (x <= 0xffff && *outbytesleft >= 3)
526     {
527         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
528         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
529         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
530         (*outbytesleft) -= 3;
531     }
532     else if (x <= 0x1fffff && *outbytesleft >= 4)
533     {
534         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
535         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
536         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
537         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
538         (*outbytesleft) -= 4;
539     }
540     else if (x <= 0x3ffffff && *outbytesleft >= 5)
541     {
542         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
543         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
544         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
545         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
546         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
547         (*outbytesleft) -= 5;
548     }
549     else if (*outbytesleft >= 6)
550     {
551         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
552         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
553         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
554         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
555         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
556         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
557         (*outbytesleft) -= 6;
558     }
559     else 
560     {
561         *error = YAZ_ICONV_E2BIG;  /* not room for output */
562         return (size_t)(-1);
563     }
564     *outbuf = (char *) outp;
565     return 0;
566 }
567
568
569 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
570                                    char **outbuf, size_t *outbytesleft,
571                                    int last)
572 {
573     /* list of two char unicode sequence that, when combined, are
574        equivalent to single unicode chars that can be represented in
575        ISO-8859-1/Latin-1.
576        Regular iconv on Linux at least does not seem to convert these,
577        but since MARC-8 to UTF-8 generates these composed sequence
578        we get a better chance of a successful MARC-8 -> ISO-8859-1
579        conversion */
580     unsigned char *outp = (unsigned char *) *outbuf;
581
582     if (cd->compose_char)
583     {
584         int i;
585         for (i = 0; latin1_comb[i].x1; i++)
586             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
587             {
588                 x = latin1_comb[i].y;
589                 break;
590             }
591         if (*outbytesleft < 1)
592         {  /* no room. Retain compose_char and bail out */
593             cd->my_errno = YAZ_ICONV_E2BIG;
594             return (size_t)(-1);
595         }
596         if (!latin1_comb[i].x1) 
597         {   /* not found. Just write compose_char */
598             *outp++ = (unsigned char) cd->compose_char;
599             (*outbytesleft)--;
600             *outbuf = (char *) outp;
601         }
602         /* compose_char used so reset it. x now holds current char */
603         cd->compose_char = 0;
604     }
605
606     if (!last && x > 32 && x < 127 && cd->compose_char == 0)
607     {
608         cd->compose_char = x;
609         return 0;
610     }
611     else if (x > 255 || x < 1)
612     {
613         cd->my_errno = YAZ_ICONV_EILSEQ;
614         return (size_t) -1;
615     }
616     else if (*outbytesleft < 1)
617     {
618         cd->my_errno = YAZ_ICONV_E2BIG;
619         return (size_t)(-1);
620     }
621     *outp++ = (unsigned char) x;
622     (*outbytesleft)--;
623     *outbuf = (char *) outp;
624     return 0;
625 }
626
627
628 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
629                               char **outbuf, size_t *outbytesleft,
630                               int last)
631 {
632     unsigned char *outp = (unsigned char *) *outbuf;
633     if (*outbytesleft >= 4)
634     {
635         *outp++ = (unsigned char) (x>>24);
636         *outp++ = (unsigned char) (x>>16);
637         *outp++ = (unsigned char) (x>>8);
638         *outp++ = (unsigned char) x;
639         (*outbytesleft) -= 4;
640     }
641     else
642     {
643         cd->my_errno = YAZ_ICONV_E2BIG;
644         return (size_t)(-1);
645     }
646     *outbuf = (char *) outp;
647     return 0;
648 }
649
650 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
651                                 char **outbuf, size_t *outbytesleft,
652                                 int last)
653 {
654     unsigned char *outp = (unsigned char *) *outbuf;
655     if (*outbytesleft >= 4)
656     {
657         *outp++ = (unsigned char) x;
658         *outp++ = (unsigned char) (x>>8);
659         *outp++ = (unsigned char) (x>>16);
660         *outp++ = (unsigned char) (x>>24);
661         (*outbytesleft) -= 4;
662     }
663     else
664     {
665         cd->my_errno = YAZ_ICONV_E2BIG;
666         return (size_t)(-1);
667     }
668     *outbuf = (char *) outp;
669     return 0;
670 }
671
672 static unsigned long lookup_marc8(yaz_iconv_t cd,
673                                   unsigned long x, int *comb,
674                                   const char **page_chr)
675 {
676     char utf8_buf[7];
677     char *utf8_outbuf = utf8_buf;
678     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
679
680     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
681     if (r == (size_t)(-1))
682     {
683         cd->my_errno = YAZ_ICONV_EILSEQ;
684         return 0;
685     }
686     else
687     {
688         unsigned char *inp;
689         size_t inbytesleft, no_read_sub = 0;
690         unsigned long x;
691
692         *utf8_outbuf = '\0';        
693         inp = (unsigned char *) utf8_buf;
694         inbytesleft = strlen(utf8_buf);
695         
696         x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
697         if (x)
698         {
699             *page_chr = "\033(B";
700             return x;
701         }
702         x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
703         if (x)
704         {
705             *page_chr = "\033g";
706             return x;
707         }
708         x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
709         if (x)
710         {
711             *page_chr = "\033b";
712             return x;
713         }
714         x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
715         if (x)
716         {
717             *page_chr = "\033p";
718             return x;
719         }
720         x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
721         if (x)
722         {
723             *page_chr = "\033(2";
724             return x;
725         }
726         x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
727         if (x)
728         {
729             *page_chr = "\033(N";
730             return x;
731         }
732         x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
733         if (x)
734         {
735             *page_chr = "\033(3";
736             return x;
737         }
738         x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
739         if (x)
740         {
741             *page_chr = "\033(S";
742             return x;
743         }
744         x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
745         if (x)
746         {
747             *page_chr = "\033$1";
748             return x;
749         }
750         cd->my_errno = YAZ_ICONV_EILSEQ;
751         return x;
752     }
753 }
754
755 static size_t flush_combos(yaz_iconv_t cd,
756                            char **outbuf, size_t *outbytesleft)
757 {
758     unsigned long y = cd->write_marc8_last;
759     unsigned char byte, second_half = 0;
760     char out_buf[10];
761     size_t i, out_no = 0;
762
763     if (!y)
764         return 0;
765
766     byte = (unsigned char )((y>>16) & 0xff);
767     if (byte)
768         out_buf[out_no++] = byte;
769     byte = (unsigned char)((y>>8) & 0xff);
770     if (byte)
771         out_buf[out_no++] = byte;
772     byte = (unsigned char )(y & 0xff);
773     if (byte)
774         out_buf[out_no++] = byte;
775
776     if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
777     {
778         cd->my_errno = YAZ_ICONV_E2BIG;
779         return (size_t) (-1);
780     }
781
782     for (i = 0; i < cd->write_marc8_comb_no; i++)
783     {
784         /* all MARC-8 combined characters are simple bytes */
785         byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
786         if (byte == 0xEB)
787             second_half = 0xEC;
788         else if (byte == 0xFA)
789             second_half = 0xFB;
790
791         *(*outbuf)++ = byte;
792         (*outbytesleft)--;
793     }
794     memcpy(*outbuf, out_buf, out_no);
795     *outbuf += out_no;
796     (*outbytesleft) -= out_no;
797     if (second_half)
798     {
799         *(*outbuf)++ = second_half;
800         (*outbytesleft)--;
801     }        
802
803     cd->write_marc8_last = 0;
804     cd->write_marc8_comb_no = 0;
805     return 0;
806 }
807
808 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
809                                 char **outbuf, size_t *outbytesleft,
810                                 int last)
811 {
812     int comb = 0;
813     const char *page_chr = 0;
814     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
815
816     if (!y)
817         return (size_t) (-1);
818
819     if (comb)
820     {
821         if (cd->write_marc8_comb_no < 6)
822             cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
823     }
824     else
825     {
826         size_t r = flush_combos(cd, outbuf, outbytesleft);
827         const char *old_page_chr = cd->write_marc8_page_chr;
828         if (r)
829             return r;
830         if (strcmp(page_chr, old_page_chr))
831         {
832             size_t plen = 0;
833             const char *page_out = page_chr;
834
835             if (*outbytesleft < 8)
836             {
837                 cd->my_errno = YAZ_ICONV_E2BIG;
838                 
839                 return (size_t) (-1);
840             }
841             cd->write_marc8_page_chr = page_chr;
842
843             if (!strcmp(old_page_chr, "\033p") 
844                 || !strcmp(old_page_chr, "\033g")
845                 || !strcmp(old_page_chr, "\033b"))
846             {
847                 /* Technique 1 leave */
848                 page_out = "\033s";
849                 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
850                 {
851                     /* Must leave script + enter new page */
852                     plen = strlen(page_out);
853                     memcpy(*outbuf, page_out, plen);
854                     (*outbuf) += plen;
855                     (*outbytesleft) -= plen;
856                     page_out = page_chr;
857                 }
858             }
859             plen = strlen(page_out);
860             memcpy(*outbuf, page_out, plen);
861             (*outbuf) += plen;
862             (*outbytesleft) -= plen;
863         }
864         cd->write_marc8_last = y;
865     }
866     if (last)
867     {
868         size_t r = flush_combos(cd, outbuf, outbytesleft);
869         if (r)
870         {
871             if (comb)
872                 cd->write_marc8_comb_no--;
873             else
874                 cd->write_marc8_last = 0;
875             return r;
876         }
877     }
878     return 0;
879 }
880
881 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
882                               char **outbuf, size_t *outbytesleft,
883                               int last)
884 {
885     int i;
886     for (i = 0; latin1_comb[i].x1; i++)
887     {
888         if (x == latin1_comb[i].y)
889         {
890             size_t r ;
891             /* save the output pointers .. */
892             char *outbuf0 = *outbuf;
893             size_t outbytesleft0 = *outbytesleft;
894             int last_ch = cd->write_marc8_last;
895
896             r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
897                                   outbuf, outbytesleft, 0);
898             if (r)
899                 return r;
900             r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
901                                   outbuf, outbytesleft, last);
902             if (r && cd->my_errno == YAZ_ICONV_E2BIG)
903             {
904                 /* not enough room. reset output to original values */
905                 *outbuf = outbuf0;
906                 *outbytesleft = outbytesleft0;
907                 cd->write_marc8_last = last_ch;
908             }
909             return r;
910         }
911     }
912     return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
913 }
914
915
#if HAVE_WCHAR_H
/**
 * Write handler for WCHAR_T output.
 *
 * Converts x to a wchar_t and copies its bytes to the output buffer.
 * The last argument is not used.
 *
 * \return 0 on success; (size_t)(-1) with cd->my_errno set to
 *         YAZ_ICONV_E2BIG when fewer than sizeof(wchar_t) bytes remain.
 */
static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
                                 char **outbuf, size_t *outbytesleft,
                                 int last)
{
    wchar_t wide = x;

    /* refuse up front if the whole wchar_t does not fit */
    if (*outbytesleft < sizeof(wchar_t))
    {
        cd->my_errno = YAZ_ICONV_E2BIG;
        return (size_t)(-1);
    }
    memcpy(*outbuf, &wide, sizeof(wide));
    *outbuf += sizeof(wide);
    *outbytesleft -= sizeof(wide);
    return 0;
}
#endif
939
940 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
941 {
942     return cd->read_handle && cd->write_handle;
943 }
944
945 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
946 {
947     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
948
949     cd->write_handle = 0;
950     cd->read_handle = 0;
951     cd->init_handle = 0;
952     cd->my_errno = YAZ_ICONV_UNKNOWN;
953
954     /* a useful hack: if fromcode has leading @,
955        the library not use YAZ's own conversions .. */
956     if (fromcode[0] == '@')
957         fromcode++;
958     else
959     {
960         if (!yaz_matchstr(fromcode, "UTF8"))
961         {
962             cd->read_handle = yaz_read_UTF8;
963             cd->init_handle = yaz_init_UTF8;
964         }
965         else if (!yaz_matchstr(fromcode, "ISO88591"))
966             cd->read_handle = yaz_read_ISO8859_1;
967         else if (!yaz_matchstr(fromcode, "UCS4"))
968             cd->read_handle = yaz_read_UCS4;
969         else if (!yaz_matchstr(fromcode, "UCS4LE"))
970             cd->read_handle = yaz_read_UCS4LE;
971         else if (!yaz_matchstr(fromcode, "MARC8"))
972             cd->read_handle = yaz_read_marc8;
973         else if (!yaz_matchstr(fromcode, "MARC8s"))
974             cd->read_handle = yaz_read_marc8s;
975 #if HAVE_WCHAR_H
976         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
977             cd->read_handle = yaz_read_wchar_t;
978 #endif
979         
980         if (!yaz_matchstr(tocode, "UTF8"))
981             cd->write_handle = yaz_write_UTF8;
982         else if (!yaz_matchstr(tocode, "ISO88591"))
983             cd->write_handle = yaz_write_ISO8859_1;
984         else if (!yaz_matchstr (tocode, "UCS4"))
985             cd->write_handle = yaz_write_UCS4;
986         else if (!yaz_matchstr(tocode, "UCS4LE"))
987             cd->write_handle = yaz_write_UCS4LE;
988         else if (!yaz_matchstr(tocode, "MARC8"))
989             cd->write_handle = yaz_write_marc8;
990         else if (!yaz_matchstr(tocode, "MARC8s"))
991             cd->write_handle = yaz_write_marc8;
992 #if HAVE_WCHAR_H
993         else if (!yaz_matchstr(tocode, "WCHAR_T"))
994             cd->write_handle = yaz_write_wchar_t;
995 #endif
996     }
997 #if HAVE_ICONV_H
998     cd->iconv_cd = 0;
999     if (!cd->read_handle || !cd->write_handle)
1000     {
1001         cd->iconv_cd = iconv_open (tocode, fromcode);
1002         if (cd->iconv_cd == (iconv_t) (-1))
1003         {
1004             xfree (cd);
1005             return 0;
1006         }
1007     }
1008 #else
1009     if (!cd->read_handle || !cd->write_handle)
1010     {
1011         xfree (cd);
1012         return 0;
1013     }
1014 #endif
1015     cd->init_flag = 1;
1016     return cd;
1017 }
1018
1019 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1020                  char **outbuf, size_t *outbytesleft)
1021 {
1022     char *inbuf0;
1023     size_t r = 0;
1024
1025 #if HAVE_ICONV_H
1026     if (cd->iconv_cd)
1027     {
1028         size_t r =
1029             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1030         if (r == (size_t)(-1))
1031         {
1032             switch (yaz_errno())
1033             {
1034             case E2BIG:
1035                 cd->my_errno = YAZ_ICONV_E2BIG;
1036                 break;
1037             case EINVAL:
1038                 cd->my_errno = YAZ_ICONV_EINVAL;
1039                 break;
1040             case EILSEQ:
1041                 cd->my_errno = YAZ_ICONV_EILSEQ;
1042                 break;
1043             default:
1044                 cd->my_errno = YAZ_ICONV_UNKNOWN;
1045             }
1046         }
1047         return r;
1048     }
1049 #endif
1050     if (inbuf == 0 || *inbuf == 0)
1051     {
1052         cd->init_flag = 1;
1053         cd->my_errno = YAZ_ICONV_UNKNOWN;
1054         return 0;
1055     }
1056     inbuf0 = *inbuf;
1057
1058     if (cd->init_flag)
1059     {
1060         if (cd->init_handle)
1061         {
1062             size_t no_read = 0;
1063             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1064                                          *inbytesleft, &no_read);
1065             if (r)
1066             {
1067                 if (cd->my_errno == YAZ_ICONV_EINVAL)
1068                     return r;
1069                 cd->init_flag = 0;
1070                 return r;
1071             }
1072             *inbytesleft -= no_read;
1073             *inbuf += no_read;
1074         }
1075         cd->marc8_esc_mode = 'B';
1076         
1077         cd->comb_offset = cd->comb_size = 0;
1078         cd->compose_char = 0;
1079         
1080         cd->write_marc8_comb_no = 0;
1081         cd->write_marc8_last = 0;
1082         cd->write_marc8_page_chr = "\033(B";
1083         
1084         cd->init_flag = 0;
1085         cd->unget_x = 0;
1086         cd->no_read_x = 0;
1087     }
1088     while (1)
1089     {
1090         unsigned long x;
1091         size_t no_read;
1092
1093         if (*inbytesleft == 0)
1094         {
1095             r = *inbuf - inbuf0;
1096             break;
1097         }
1098         if (!cd->unget_x)
1099         {
1100             x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1101                                   &no_read);
1102             if (no_read == 0)
1103             {
1104                 r = (size_t)(-1);
1105                 break;
1106             }
1107         }
1108         else
1109         {
1110             x = cd->unget_x;
1111             no_read = cd->no_read_x;
1112         }
1113         if (x)
1114         {
1115             r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1116                                    (*inbytesleft - no_read) == 0 ? 1 : 0);
1117             if (r)
1118             {
1119                 /* unable to write it. save it because read_handle cannot
1120                    rewind .. */
1121                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1122                 {
1123                     cd->unget_x = x;
1124                     cd->no_read_x = no_read;
1125                     break;
1126                 }
1127             }
1128             cd->unget_x = 0;
1129         }
1130         *inbytesleft -= no_read;
1131         (*inbuf) += no_read;
1132     }
1133     return r;
1134 }
1135
1136 int yaz_iconv_error (yaz_iconv_t cd)
1137 {
1138     return cd->my_errno;
1139 }
1140
1141 int yaz_iconv_close (yaz_iconv_t cd)
1142 {
1143 #if HAVE_ICONV_H
1144     if (cd->iconv_cd)
1145         iconv_close (cd->iconv_cd);
1146 #endif
1147     xfree (cd);
1148     return 0;
1149 }
1150
1151 /*
1152  * Local variables:
1153  * c-basic-offset: 4
1154  * indent-tabs-mode: nil
1155  * End:
1156  * vim: shiftwidth=4 tabstop=8 expandtab
1157  */
1158