1cd81ed659a29042b1549329c4a1eafe96cf38c7
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2007, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.33 2007-01-18 14:45:05 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  *
16  * MARC-8 reference:
17  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
18  */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <errno.h>
26 #include <string.h>
27 #include <ctype.h>
28 #if HAVE_WCHAR_H
29 #include <wchar.h>
30 #endif
31
32 #if HAVE_ICONV_H
33 #include <iconv.h>
34 #endif
35
36
37 #include <yaz/yaz-util.h>
38
/*
 * Character-table converters defined outside this file.
 *
 * All of them share one signature: they look at up to inbytesleft bytes
 * starting at inp, store the number of bytes consumed in *no_read, may
 * set *combining, and return the converted character value.
 *
 * yaz_marc8_N_conv is called by the MARC-8 reader in this file and
 * yaz_marc8r_N_conv by the MARC-8 writer.
 */
typedef unsigned long yaz_marc8_conv_fn(unsigned char *inp,
                                        size_t inbytesleft,
                                        size_t *no_read, int *combining);

yaz_marc8_conv_fn yaz_marc8_1_conv, yaz_marc8_2_conv, yaz_marc8_3_conv,
                  yaz_marc8_4_conv, yaz_marc8_5_conv, yaz_marc8_6_conv,
                  yaz_marc8_7_conv, yaz_marc8_8_conv, yaz_marc8_9_conv;

yaz_marc8_conv_fn yaz_marc8r_1_conv, yaz_marc8r_2_conv, yaz_marc8r_3_conv,
                  yaz_marc8r_4_conv, yaz_marc8r_5_conv, yaz_marc8r_6_conv,
                  yaz_marc8r_7_conv, yaz_marc8r_8_conv, yaz_marc8r_9_conv;
77
78 struct yaz_iconv_struct {
79     int my_errno;
80     int init_flag;
81     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82                           size_t inbytesleft, size_t *no_read);
83     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84                                  size_t inbytesleft, size_t *no_read);
85     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86                            char **outbuf, size_t *outbytesleft,
87                            int last);
88     size_t (*flush_handle)(yaz_iconv_t cd,
89                            char **outbuf, size_t *outbytesleft);
90     int marc8_esc_mode;
91
92     int comb_offset;
93     int comb_size;
94     unsigned long comb_x[8];
95     size_t comb_no_read[8];
96     size_t no_read_x;
97     unsigned long unget_x;
98 #if HAVE_ICONV_H
99     iconv_t iconv_cd;
100 #endif
101     unsigned long compose_char;
102
103     unsigned long write_marc8_comb_ch[8];
104     size_t write_marc8_comb_no;
105     unsigned write_marc8_second_half_char;
106     unsigned long write_marc8_last;
107     const char *write_marc8_page_chr;
108 };
109
/*
 * Latin-1 composition table.
 *
 * Each entry pairs a base letter (x1) and a Unicode combining mark (x2)
 * with the single ISO-8859-1 code point (y) that represents the
 * combination. The list ends with an all-zero sentinel; loops over the
 * table stop at the first entry whose x1 is 0.
 *
 * The table is only ever read, so it is declared const.
 */
static const struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
    { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
    { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
    { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
    { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
    /* omitted:    0xc6      LATIN CAPITAL LETTER AE */
    { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
    { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
    { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
    { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
    { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
    { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
    { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
    { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
    { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
    { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
    { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
    { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
    { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
    { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
    { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
    /* omitted:    0xd7      MULTIPLICATION SIGN */
    /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
    { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
    { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
    { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
    { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
    { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
    /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
    /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
    { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
    { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
    { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
    { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
    { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
    { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
    /* omitted:    0xe6      LATIN SMALL LETTER AE */
    { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
    { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
    { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
    { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
    { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
    { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
    { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
    { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
    { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
    /* omitted:    0xf0      LATIN SMALL LETTER ETH */
    { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
    { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
    { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
    { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
    { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
    { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
    /* omitted:    0xf7      DIVISION SIGN */
    /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
    { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
    { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
    { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
    { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
    { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
    /* omitted:    0xfe      LATIN SMALL LETTER THORN */
    { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */

    { 0, 0, 0}            /* sentinel */
};
180
181 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
182                                          size_t inbytesleft, size_t *no_read)
183 {
184     unsigned long x = inp[0];
185     *no_read = 1;
186     return x;
187 }
188
189
190 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
191                              size_t inbytesleft, size_t *no_read)
192 {
193     if (inp[0] != 0xef)
194     {
195         *no_read = 0;
196         return 0;
197     }
198     if (inbytesleft < 3)
199     {
200         cd->my_errno = YAZ_ICONV_EINVAL;
201         return (size_t) -1;
202     }
203     if (inp[1] != 0xbb && inp[2] == 0xbf)
204         *no_read = 3;
205     else
206         *no_read = 0;
207     return 0;
208 }
209
210 unsigned long yaz_read_UTF8_char(unsigned char *inp,
211                                  size_t inbytesleft, size_t *no_read,
212                                  int *error)
213 {
214     unsigned long x = 0;
215
216     if (inp[0] <= 0x7f)
217     {
218         x = inp[0];
219         *no_read = 1;
220     }
221     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
222     {
223         *no_read = 0;
224         *error = YAZ_ICONV_EILSEQ;
225     }
226     else if (inp[0] <= 0xdf && inbytesleft >= 2)
227     {
228         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
229         if (x >= 0x80)
230             *no_read = 2;
231         else
232         {
233             *no_read = 0;
234             *error = YAZ_ICONV_EILSEQ;
235         }
236     }
237     else if (inp[0] <= 0xef && inbytesleft >= 3)
238     {
239         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
240             (inp[2] & 0x3f);
241         if (x >= 0x800)
242             *no_read = 3;
243         else
244         {
245             *no_read = 0;
246             *error = YAZ_ICONV_EILSEQ;
247         }
248     }
249     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
250     {
251         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
252             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
253         if (x >= 0x10000)
254             *no_read = 4;
255         else
256         {
257             *no_read = 0;
258             *error = YAZ_ICONV_EILSEQ;
259         }
260     }
261     else if (inp[0] <= 0xfb && inbytesleft >= 5)
262     {
263         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
264             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
265             (inp[4] & 0x3f);
266         if (x >= 0x200000)
267             *no_read = 5;
268         else
269         {
270             *no_read = 0;
271             *error = YAZ_ICONV_EILSEQ;
272         }
273     }
274     else if (inp[0] <= 0xfd && inbytesleft >= 6)
275     {
276         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
277             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
278             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
279         if (x >= 0x4000000)
280             *no_read = 6;
281         else
282         {
283             *no_read = 0;
284             *error = YAZ_ICONV_EILSEQ;
285         }
286     }
287     else
288     {
289         *no_read = 0;
290         *error = YAZ_ICONV_EINVAL;
291     }
292     return x;
293 }
294
295 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
296                                     size_t inbytesleft, size_t *no_read)
297 {
298     return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
299 }
300
301 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
302                                     size_t inbytesleft, size_t *no_read)
303 {
304     unsigned long x = 0;
305     
306     if (inbytesleft < 4)
307     {
308         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
309         *no_read = 0;
310     }
311     else
312     {
313         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
314         *no_read = 4;
315     }
316     return x;
317 }
318
319 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
320                                       size_t inbytesleft, size_t *no_read)
321 {
322     unsigned long x = 0;
323     
324     if (inbytesleft < 4)
325     {
326         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
327         *no_read = 0;
328     }
329     else
330     {
331         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
332         *no_read = 4;
333     }
334     return x;
335 }
336
337 #if HAVE_WCHAR_H
338 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
339                                        size_t inbytesleft, size_t *no_read)
340 {
341     unsigned long x = 0;
342     
343     if (inbytesleft < sizeof(wchar_t))
344     {
345         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
346         *no_read = 0;
347     }
348     else
349     {
350         wchar_t wch;
351         memcpy (&wch, inp, sizeof(wch));
352         x = wch;
353         *no_read = sizeof(wch);
354     }
355     return x;
356 }
357 #endif
358
359
360 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
361                                           size_t inbytesleft, size_t *no_read,
362                                           int *comb);
363
364 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
365                                      size_t inbytesleft, size_t *no_read)
366 {
367     unsigned long x;
368     if (cd->comb_offset < cd->comb_size)
369     {
370         *no_read = cd->comb_no_read[cd->comb_offset];
371         x = cd->comb_x[cd->comb_offset];
372
373         /* special case for double-diacritic combining characters, 
374            INVERTED BREVE and DOUBLE TILDE.
375            We'll increment the no_read counter by 1, since we want to skip over
376            the processing of the closing ligature character
377         */
378         /* this code is no longer necessary.. our handlers code in
379            yaz_marc8_?_conv (generated by charconv.tcl) now returns
380            0 and no_read=1 when a sequence does not match the input.
381            The SECOND HALFs in codetables.xml produces a non-existant
382            entry in the conversion trie.. Hence when met, the input byte is
383            skipped as it should (in yaz_iconv)
384         */
385 #if 0
386         if (x == 0x0361 || x == 0x0360)
387             *no_read += 1;
388 #endif
389         cd->comb_offset++;
390         return x;
391     }
392
393     cd->comb_offset = 0;
394     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
395     {
396         int comb = 0;
397         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
398         if (!comb || !x)
399             break;
400         cd->comb_x[cd->comb_size] = x;
401         cd->comb_no_read[cd->comb_size] = *no_read;
402         inp += *no_read;
403         inbytesleft = inbytesleft - *no_read;
404     }
405     return x;
406 }
407
408 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
409                                      size_t inbytesleft, size_t *no_read)
410 {
411     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
412     if (x && cd->comb_size == 1)
413     {
414         /* For MARC8s we try to get a Latin-1 page code out of it */
415         int i;
416         for (i = 0; latin1_comb[i].x1; i++)
417             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
418             {
419                 *no_read += cd->comb_no_read[0];
420                 cd->comb_size = 0;
421                 x = latin1_comb[i].y;
422                 break;
423             }
424     }
425     return x;
426 }
427
428 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
429                                          size_t inbytesleft, size_t *no_read,
430                                          int *comb)
431 {
432     *no_read = 0;
433     while(inbytesleft >= 1 && inp[0] == 27)
434     {
435         size_t inbytesleft0 = inbytesleft;
436         inp++;
437         inbytesleft--;
438         while(inbytesleft > 0 && strchr("(,$!)-", *inp))
439         {
440             inbytesleft--;
441             inp++;
442         }
443         if (inbytesleft <= 0)
444         {
445             *no_read = 0;
446             cd->my_errno = YAZ_ICONV_EINVAL;
447             return 0;
448         }
449         cd->marc8_esc_mode = *inp++;
450         inbytesleft--;
451         (*no_read) += inbytesleft0 - inbytesleft;
452     }
453     if (inbytesleft <= 0)
454         return 0;
455     else
456     {
457         unsigned long x;
458         size_t no_read_sub = 0;
459         *comb = 0;
460
461         switch(cd->marc8_esc_mode)
462         {
463         case 'B':  /* Basic ASCII */
464         case 'E':  /* ANSEL */
465         case 's':  /* ASCII */
466             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
467             break;
468         case 'g':  /* Greek */
469             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
470             break;
471         case 'b':  /* Subscripts */
472             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
473             break;
474         case 'p':  /* Superscripts */
475             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
476             break;
477         case '2':  /* Basic Hebrew */
478             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
479             break;
480         case 'N':  /* Basic Cyrillic */
481         case 'Q':  /* Extended Cyrillic */
482             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
483             break;
484         case '3':  /* Basic Arabic */
485         case '4':  /* Extended Arabic */
486             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
487             break;
488         case 'S':  /* Greek */
489             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
490             break;
491         case '1':  /* Chinese, Japanese, Korean (EACC) */
492             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
493             break;
494         default:
495             *no_read = 0;
496             cd->my_errno = YAZ_ICONV_EILSEQ;
497             return 0;
498         }
499         *no_read += no_read_sub;
500         return x;
501     }
502 }
503
504 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
505                              char **outbuf, size_t *outbytesleft,
506                              int last)
507 {
508     return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
509 }
510
511 size_t yaz_write_UTF8_char(unsigned long x,
512                            char **outbuf, size_t *outbytesleft,
513                            int *error)
514 {
515     unsigned char *outp = (unsigned char *) *outbuf;
516
517     if (x <= 0x7f && *outbytesleft >= 1)
518     {
519         *outp++ = (unsigned char) x;
520         (*outbytesleft)--;
521     } 
522     else if (x <= 0x7ff && *outbytesleft >= 2)
523     {
524         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
525         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
526         (*outbytesleft) -= 2;
527     }
528     else if (x <= 0xffff && *outbytesleft >= 3)
529     {
530         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
531         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
532         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
533         (*outbytesleft) -= 3;
534     }
535     else if (x <= 0x1fffff && *outbytesleft >= 4)
536     {
537         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
538         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
539         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
540         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
541         (*outbytesleft) -= 4;
542     }
543     else if (x <= 0x3ffffff && *outbytesleft >= 5)
544     {
545         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
546         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
547         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
548         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
549         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
550         (*outbytesleft) -= 5;
551     }
552     else if (*outbytesleft >= 6)
553     {
554         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
555         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
556         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
557         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
558         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
559         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
560         (*outbytesleft) -= 6;
561     }
562     else 
563     {
564         *error = YAZ_ICONV_E2BIG;  /* not room for output */
565         return (size_t)(-1);
566     }
567     *outbuf = (char *) outp;
568     return 0;
569 }
570
571
572 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
573                                    char **outbuf, size_t *outbytesleft,
574                                    int last)
575 {
576     /* list of two char unicode sequence that, when combined, are
577        equivalent to single unicode chars that can be represented in
578        ISO-8859-1/Latin-1.
579        Regular iconv on Linux at least does not seem to convert these,
580        but since MARC-8 to UTF-8 generates these composed sequence
581        we get a better chance of a successful MARC-8 -> ISO-8859-1
582        conversion */
583     unsigned char *outp = (unsigned char *) *outbuf;
584
585     if (cd->compose_char)
586     {
587         int i;
588         for (i = 0; latin1_comb[i].x1; i++)
589             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
590             {
591                 x = latin1_comb[i].y;
592                 break;
593             }
594         if (*outbytesleft < 1)
595         {  /* no room. Retain compose_char and bail out */
596             cd->my_errno = YAZ_ICONV_E2BIG;
597             return (size_t)(-1);
598         }
599         if (!latin1_comb[i].x1) 
600         {   /* not found. Just write compose_char */
601             *outp++ = (unsigned char) cd->compose_char;
602             (*outbytesleft)--;
603             *outbuf = (char *) outp;
604         }
605         /* compose_char used so reset it. x now holds current char */
606         cd->compose_char = 0;
607     }
608
609     if (!last && x > 32 && x < 127 && cd->compose_char == 0)
610     {
611         cd->compose_char = x;
612         return 0;
613     }
614     else if (x > 255 || x < 1)
615     {
616         cd->my_errno = YAZ_ICONV_EILSEQ;
617         return (size_t) -1;
618     }
619     else if (*outbytesleft < 1)
620     {
621         cd->my_errno = YAZ_ICONV_E2BIG;
622         return (size_t)(-1);
623     }
624     *outp++ = (unsigned char) x;
625     (*outbytesleft)--;
626     *outbuf = (char *) outp;
627     return 0;
628 }
629
630
631 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
632                               char **outbuf, size_t *outbytesleft,
633                               int last)
634 {
635     unsigned char *outp = (unsigned char *) *outbuf;
636     if (*outbytesleft >= 4)
637     {
638         *outp++ = (unsigned char) (x>>24);
639         *outp++ = (unsigned char) (x>>16);
640         *outp++ = (unsigned char) (x>>8);
641         *outp++ = (unsigned char) x;
642         (*outbytesleft) -= 4;
643     }
644     else
645     {
646         cd->my_errno = YAZ_ICONV_E2BIG;
647         return (size_t)(-1);
648     }
649     *outbuf = (char *) outp;
650     return 0;
651 }
652
653 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
654                                 char **outbuf, size_t *outbytesleft,
655                                 int last)
656 {
657     unsigned char *outp = (unsigned char *) *outbuf;
658     if (*outbytesleft >= 4)
659     {
660         *outp++ = (unsigned char) x;
661         *outp++ = (unsigned char) (x>>8);
662         *outp++ = (unsigned char) (x>>16);
663         *outp++ = (unsigned char) (x>>24);
664         (*outbytesleft) -= 4;
665     }
666     else
667     {
668         cd->my_errno = YAZ_ICONV_E2BIG;
669         return (size_t)(-1);
670     }
671     *outbuf = (char *) outp;
672     return 0;
673 }
674
675 static unsigned long lookup_marc8(yaz_iconv_t cd,
676                                   unsigned long x, int *comb,
677                                   const char **page_chr)
678 {
679     char utf8_buf[7];
680     char *utf8_outbuf = utf8_buf;
681     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
682
683     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
684     if (r == (size_t)(-1))
685     {
686         cd->my_errno = YAZ_ICONV_EILSEQ;
687         return 0;
688     }
689     else
690     {
691         unsigned char *inp;
692         size_t inbytesleft, no_read_sub = 0;
693         unsigned long x;
694
695         *utf8_outbuf = '\0';        
696         inp = (unsigned char *) utf8_buf;
697         inbytesleft = strlen(utf8_buf);
698         
699         x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
700         if (x)
701         {
702             *page_chr = "\033(B";
703             return x;
704         }
705         x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
706         if (x)
707         {
708             *page_chr = "\033g";
709             return x;
710         }
711         x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
712         if (x)
713         {
714             *page_chr = "\033b";
715             return x;
716         }
717         x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
718         if (x)
719         {
720             *page_chr = "\033p";
721             return x;
722         }
723         x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
724         if (x)
725         {
726             *page_chr = "\033(2";
727             return x;
728         }
729         x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
730         if (x)
731         {
732             *page_chr = "\033(N";
733             return x;
734         }
735         x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
736         if (x)
737         {
738             *page_chr = "\033(3";
739             return x;
740         }
741         x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
742         if (x)
743         {
744             *page_chr = "\033(S";
745             return x;
746         }
747         x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
748         if (x)
749         {
750             *page_chr = "\033$1";
751             return x;
752         }
753         cd->my_errno = YAZ_ICONV_EILSEQ;
754         return x;
755     }
756 }
757
758 static size_t flush_combos(yaz_iconv_t cd,
759                            char **outbuf, size_t *outbytesleft)
760 {
761     unsigned long y = cd->write_marc8_last;
762     unsigned char byte;
763     char out_buf[10];
764     size_t i, out_no = 0;
765
766     if (!y)
767         return 0;
768
769     byte = (unsigned char )((y>>16) & 0xff);
770     if (byte)
771         out_buf[out_no++] = byte;
772     byte = (unsigned char)((y>>8) & 0xff);
773     if (byte)
774         out_buf[out_no++] = byte;
775     byte = (unsigned char )(y & 0xff);
776     if (byte)
777         out_buf[out_no++] = byte;
778
779     if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
780     {
781         cd->my_errno = YAZ_ICONV_E2BIG;
782         return (size_t) (-1);
783     }
784
785     for (i = 0; i < cd->write_marc8_comb_no; i++)
786     {
787         /* all MARC-8 combined characters are simple bytes */
788         byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
789         *(*outbuf)++ = byte;
790         (*outbytesleft)--;
791     }
792     memcpy(*outbuf, out_buf, out_no);
793     *outbuf += out_no;
794     (*outbytesleft) -= out_no;
795     if (cd->write_marc8_second_half_char)
796     {
797         *(*outbuf)++ = cd->write_marc8_second_half_char;
798         (*outbytesleft)--;
799     }        
800
801     cd->write_marc8_last = 0;
802     cd->write_marc8_comb_no = 0;
803     cd->write_marc8_second_half_char = 0;
804     return 0;
805 }
806
807 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
808                                 char **outbuf, size_t *outbytesleft,
809                                 int last)
810 {
811     int comb = 0;
812     const char *page_chr = 0;
813     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
814
815     if (!y)
816         return (size_t) (-1);
817
818     if (comb)
819     {
820         if (x == 0x0361)
821             cd->write_marc8_second_half_char = 0xEC;
822         else if (x == 0x0360)
823             cd->write_marc8_second_half_char = 0xFB;
824
825         if (cd->write_marc8_comb_no < 6)
826             cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
827     }
828     else
829     {
830         size_t r = flush_combos(cd, outbuf, outbytesleft);
831         const char *old_page_chr = cd->write_marc8_page_chr;
832         if (r)
833             return r;
834         if (strcmp(page_chr, old_page_chr))
835         {
836             size_t plen = 0;
837             const char *page_out = page_chr;
838
839             if (*outbytesleft < 8)
840             {
841                 cd->my_errno = YAZ_ICONV_E2BIG;
842                 
843                 return (size_t) (-1);
844             }
845             cd->write_marc8_page_chr = page_chr;
846
847             if (!strcmp(old_page_chr, "\033p") 
848                 || !strcmp(old_page_chr, "\033g")
849                 || !strcmp(old_page_chr, "\033b"))
850             {
851                 /* Technique 1 leave */
852                 page_out = "\033s";
853                 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
854                 {
855                     /* Must leave script + enter new page */
856                     plen = strlen(page_out);
857                     memcpy(*outbuf, page_out, plen);
858                     (*outbuf) += plen;
859                     (*outbytesleft) -= plen;
860                     page_out = page_chr;
861                 }
862             }
863             plen = strlen(page_out);
864             memcpy(*outbuf, page_out, plen);
865             (*outbuf) += plen;
866             (*outbytesleft) -= plen;
867         }
868         cd->write_marc8_last = y;
869     }
870     if (last)
871     {
872         size_t r = flush_combos(cd, outbuf, outbytesleft);
873         if (r)
874         {
875             if (comb)
876                 cd->write_marc8_comb_no--;
877             else
878                 cd->write_marc8_last = 0;
879             return r;
880         }
881     }
882     return 0;
883 }
884
885 static size_t yaz_flush_marc8(yaz_iconv_t cd,
886                               char **outbuf, size_t *outbytesleft)
887 {
888     if (strcmp(cd->write_marc8_page_chr, "\033(B"))
889     {
890         if (*outbytesleft < 3)
891         {
892             cd->my_errno = YAZ_ICONV_E2BIG;
893             return (size_t) (-1);
894         }
895         memcpy(*outbuf, "\033(B", 3);
896         (*outbuf) += 3;
897         *outbytesleft -= 3;
898     }
899     return 0;
900 }
901
902 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
903                               char **outbuf, size_t *outbytesleft,
904                               int last)
905 {
906     int i;
907     for (i = 0; latin1_comb[i].x1; i++)
908     {
909         if (x == latin1_comb[i].y)
910         {
911             size_t r ;
912             /* save the output pointers .. */
913             char *outbuf0 = *outbuf;
914             size_t outbytesleft0 = *outbytesleft;
915             int last_ch = cd->write_marc8_last;
916
917             r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
918                                   outbuf, outbytesleft, 0);
919             if (r)
920                 return r;
921             r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
922                                   outbuf, outbytesleft, last);
923             if (r && cd->my_errno == YAZ_ICONV_E2BIG)
924             {
925                 /* not enough room. reset output to original values */
926                 *outbuf = outbuf0;
927                 *outbytesleft = outbytesleft0;
928                 cd->write_marc8_last = last_ch;
929             }
930             return r;
931         }
932     }
933     return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
934 }
935
936
937 #if HAVE_WCHAR_H
938 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
939                                  char **outbuf, size_t *outbytesleft,
940                                  int last)
941 {
942     unsigned char *outp = (unsigned char *) *outbuf;
943
944     if (*outbytesleft >= sizeof(wchar_t))
945     {
946         wchar_t wch = x;
947         memcpy(outp, &wch, sizeof(wch));
948         outp += sizeof(wch);
949         (*outbytesleft) -= sizeof(wch);
950     }
951     else
952     {
953         cd->my_errno = YAZ_ICONV_E2BIG;
954         return (size_t)(-1);
955     }
956     *outbuf = (char *) outp;
957     return 0;
958 }
959 #endif
960
961 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
962 {
963     return cd->read_handle && cd->write_handle;
964 }
965
966 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
967 {
968     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
969
970     cd->write_handle = 0;
971     cd->read_handle = 0;
972     cd->init_handle = 0;
973     cd->flush_handle = 0;
974     cd->my_errno = YAZ_ICONV_UNKNOWN;
975
976     /* a useful hack: if fromcode has leading @,
977        the library not use YAZ's own conversions .. */
978     if (fromcode[0] == '@')
979         fromcode++;
980     else
981     {
982         if (!yaz_matchstr(fromcode, "UTF8"))
983         {
984             cd->read_handle = yaz_read_UTF8;
985             cd->init_handle = yaz_init_UTF8;
986         }
987         else if (!yaz_matchstr(fromcode, "ISO88591"))
988             cd->read_handle = yaz_read_ISO8859_1;
989         else if (!yaz_matchstr(fromcode, "UCS4"))
990             cd->read_handle = yaz_read_UCS4;
991         else if (!yaz_matchstr(fromcode, "UCS4LE"))
992             cd->read_handle = yaz_read_UCS4LE;
993         else if (!yaz_matchstr(fromcode, "MARC8"))
994             cd->read_handle = yaz_read_marc8;
995         else if (!yaz_matchstr(fromcode, "MARC8s"))
996             cd->read_handle = yaz_read_marc8s;
997 #if HAVE_WCHAR_H
998         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
999             cd->read_handle = yaz_read_wchar_t;
1000 #endif
1001         
1002         if (!yaz_matchstr(tocode, "UTF8"))
1003             cd->write_handle = yaz_write_UTF8;
1004         else if (!yaz_matchstr(tocode, "ISO88591"))
1005             cd->write_handle = yaz_write_ISO8859_1;
1006         else if (!yaz_matchstr (tocode, "UCS4"))
1007             cd->write_handle = yaz_write_UCS4;
1008         else if (!yaz_matchstr(tocode, "UCS4LE"))
1009             cd->write_handle = yaz_write_UCS4LE;
1010         else if (!yaz_matchstr(tocode, "MARC8"))
1011         {
1012             cd->write_handle = yaz_write_marc8;
1013             cd->flush_handle = yaz_flush_marc8;
1014         }
1015         else if (!yaz_matchstr(tocode, "MARC8s"))
1016         {
1017             cd->write_handle = yaz_write_marc8;
1018             cd->flush_handle = yaz_flush_marc8;
1019         }
1020 #if HAVE_WCHAR_H
1021         else if (!yaz_matchstr(tocode, "WCHAR_T"))
1022             cd->write_handle = yaz_write_wchar_t;
1023 #endif
1024     }
1025 #if HAVE_ICONV_H
1026     cd->iconv_cd = 0;
1027     if (!cd->read_handle || !cd->write_handle)
1028     {
1029         cd->iconv_cd = iconv_open (tocode, fromcode);
1030         if (cd->iconv_cd == (iconv_t) (-1))
1031         {
1032             xfree (cd);
1033             return 0;
1034         }
1035     }
1036 #else
1037     if (!cd->read_handle || !cd->write_handle)
1038     {
1039         xfree (cd);
1040         return 0;
1041     }
1042 #endif
1043     cd->init_flag = 1;
1044     return cd;
1045 }
1046
1047 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1048                  char **outbuf, size_t *outbytesleft)
1049 {
1050     char *inbuf0 = 0;
1051     size_t r = 0;
1052
1053 #if HAVE_ICONV_H
1054     if (cd->iconv_cd)
1055     {
1056         size_t r =
1057             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1058         if (r == (size_t)(-1))
1059         {
1060             switch (yaz_errno())
1061             {
1062             case E2BIG:
1063                 cd->my_errno = YAZ_ICONV_E2BIG;
1064                 break;
1065             case EINVAL:
1066                 cd->my_errno = YAZ_ICONV_EINVAL;
1067                 break;
1068             case EILSEQ:
1069                 cd->my_errno = YAZ_ICONV_EILSEQ;
1070                 break;
1071             default:
1072                 cd->my_errno = YAZ_ICONV_UNKNOWN;
1073             }
1074         }
1075         return r;
1076     }
1077 #endif
1078
1079     if (inbuf)
1080         inbuf0 = *inbuf;
1081
1082     if (cd->init_flag)
1083     {
1084         cd->my_errno = YAZ_ICONV_UNKNOWN;
1085         cd->marc8_esc_mode = 'B';
1086         
1087         cd->comb_offset = cd->comb_size = 0;
1088         cd->compose_char = 0;
1089         
1090         cd->write_marc8_comb_no = 0;
1091         cd->write_marc8_second_half_char = 0;
1092         cd->write_marc8_last = 0;
1093         cd->write_marc8_page_chr = "\033(B";
1094         
1095         cd->unget_x = 0;
1096         cd->no_read_x = 0;
1097     }
1098
1099     if (cd->init_flag)
1100     {
1101         if (cd->init_handle && inbuf && *inbuf)
1102         {
1103             size_t no_read = 0;
1104             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1105                                          *inbytesleft, &no_read);
1106             if (r)
1107             {
1108                 if (cd->my_errno == YAZ_ICONV_EINVAL)
1109                     return r;
1110                 cd->init_flag = 0;
1111                 return r;
1112             }
1113             *inbytesleft -= no_read;
1114             *inbuf += no_read;
1115         }
1116     }
1117     cd->init_flag = 0;
1118
1119     while (1)
1120     {
1121         unsigned long x;
1122         size_t no_read;
1123
1124         if (cd->unget_x)
1125         {
1126             x = cd->unget_x;
1127             no_read = cd->no_read_x;
1128         }
1129         else if (inbuf && *inbuf)
1130         {
1131             if (*inbytesleft == 0)
1132             {
1133                 r = *inbuf - inbuf0;
1134                 break;
1135             }
1136             x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1137                                   &no_read);
1138             if (no_read == 0)
1139             {
1140                 r = (size_t)(-1);
1141                 break;
1142             }
1143         }
1144         else
1145         {
1146             r = 0;
1147             if (cd->flush_handle && outbuf && *outbuf)
1148                 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1149             if (r == 0)
1150                 cd->init_flag = 1;
1151             break;
1152         }
1153         if (x)
1154         {
1155             r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1156                                    (*inbytesleft - no_read) == 0 ? 1 : 0);
1157             if (r)
1158             {
1159                 /* unable to write it. save it because read_handle cannot
1160                    rewind .. */
1161                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1162                 {
1163                     cd->unget_x = x;
1164                     cd->no_read_x = no_read;
1165                     break;
1166                 }
1167             }
1168             cd->unget_x = 0;
1169         }
1170         *inbytesleft -= no_read;
1171         (*inbuf) += no_read;
1172     }
1173     return r;
1174 }
1175
1176 int yaz_iconv_error (yaz_iconv_t cd)
1177 {
1178     return cd->my_errno;
1179 }
1180
1181 int yaz_iconv_close (yaz_iconv_t cd)
1182 {
1183 #if HAVE_ICONV_H
1184     if (cd->iconv_cd)
1185         iconv_close (cd->iconv_cd);
1186 #endif
1187     xfree (cd);
1188     return 0;
1189 }
1190
1191 /*
1192  * Local variables:
1193  * c-basic-offset: 4
1194  * indent-tabs-mode: nil
1195  * End:
1196  * vim: shiftwidth=4 tabstop=8 expandtab
1197  */
1198