Fixed yaz_iconv to return YAZ_ICONV_EINVAL if an incomplete MARC-8
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2007, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.34 2007-03-09 08:39:38 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  *
16  * MARC-8 reference:
17  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
18  */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <errno.h>
26 #include <string.h>
27 #include <ctype.h>
28 #if HAVE_WCHAR_H
29 #include <wchar.h>
30 #endif
31
32 #if HAVE_ICONV_H
33 #include <iconv.h>
34 #endif
35
36
37 #include <yaz/yaz-util.h>
38
39 unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
40                                size_t *no_read, int *combining);
41 unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
42                                size_t *no_read, int *combining);
43 unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
44                                size_t *no_read, int *combining);
45 unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
46                                size_t *no_read, int *combining);
47 unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
48                                size_t *no_read, int *combining);
49 unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
50                                size_t *no_read, int *combining);
51 unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
52                                size_t *no_read, int *combining);
53 unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
54                                size_t *no_read, int *combining);
55 unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
56                                size_t *no_read, int *combining);
57
58
59 unsigned long yaz_marc8r_1_conv(unsigned char *inp, size_t inbytesleft,
60                                 size_t *no_read, int *combining);
61 unsigned long yaz_marc8r_2_conv(unsigned char *inp, size_t inbytesleft,
62                                 size_t *no_read, int *combining);
63 unsigned long yaz_marc8r_3_conv(unsigned char *inp, size_t inbytesleft,
64                                 size_t *no_read, int *combining);
65 unsigned long yaz_marc8r_4_conv(unsigned char *inp, size_t inbytesleft,
66                                 size_t *no_read, int *combining);
67 unsigned long yaz_marc8r_5_conv(unsigned char *inp, size_t inbytesleft,
68                                 size_t *no_read, int *combining);
69 unsigned long yaz_marc8r_6_conv(unsigned char *inp, size_t inbytesleft,
70                                 size_t *no_read, int *combining);
71 unsigned long yaz_marc8r_7_conv(unsigned char *inp, size_t inbytesleft,
72                                 size_t *no_read, int *combining);
73 unsigned long yaz_marc8r_8_conv(unsigned char *inp, size_t inbytesleft,
74                                 size_t *no_read, int *combining);
75 unsigned long yaz_marc8r_9_conv(unsigned char *inp, size_t inbytesleft,
76                                 size_t *no_read, int *combining);
77
/* Per-descriptor conversion state: the reader/writer callbacks selected for
   a conversion plus the scratch state those callbacks keep between calls. */
78 struct yaz_iconv_struct {
79     int my_errno; /* last error; handlers store YAZ_ICONV_E2BIG/EILSEQ/EINVAL */
80     int init_flag; /* NOTE(review): not referenced in this part of the file; presumably records that init_handle has run - confirm */
81     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
82                           size_t inbytesleft, size_t *no_read); /* start-of-input hook; yaz_init_UTF8 has this signature */
83     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
84                                  size_t inbytesleft, size_t *no_read); /* decodes one character; bytes consumed go to *no_read */
85     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
86                            char **outbuf, size_t *outbytesleft,
87                            int last); /* encodes one character x; returns 0 or (size_t)-1 */
88     size_t (*flush_handle)(yaz_iconv_t cd,
89                            char **outbuf, size_t *outbytesleft); /* end-of-output hook; yaz_flush_marc8 has this signature */
90     int marc8_esc_mode; /* final byte of the latest MARC-8 escape sequence; selects the table in yaz_read_marc8_comb */
91
92     int comb_offset; /* next entry of comb_x[] to hand out (yaz_read_marc8) */
93     int comb_size; /* number of valid entries in comb_x[] / comb_no_read[] */
94     unsigned long comb_x[8]; /* combining characters read ahead of their base character */
95     size_t comb_no_read[8]; /* input bytes consumed by each comb_x[] entry */
96     size_t no_read_x; /* NOTE(review): not referenced in this part of the file */
97     unsigned long unget_x; /* NOTE(review): not referenced in this part of the file */
98 #if HAVE_ICONV_H
99     iconv_t iconv_cd; /* NOTE(review): presumably the system iconv descriptor when one is used - confirm */
100 #endif
101     unsigned long compose_char; /* character held back by yaz_write_ISO8859_1 awaiting a possible combining mark; 0 if none */
102
103     unsigned long write_marc8_comb_ch[8]; /* MARC-8 combining bytes waiting for flush_combos */
104     size_t write_marc8_comb_no; /* number of entries used in write_marc8_comb_ch[] */
105     unsigned write_marc8_second_half_char; /* byte flush_combos emits after the base character; 0 if none */
106     unsigned long write_marc8_last; /* held-back MARC-8 base character (up to three bytes packed); 0 if none */
107     const char *write_marc8_page_chr; /* escape sequence of the page most recently selected for output */
108 };
109
/* Latin-1 composition table.
 *
 * Each entry maps a base character x1 followed by the Unicode combining
 * mark x2 to the single ISO-8859-1 code point y that represents the
 * composed character.  The table is terminated by an all-zero entry, so
 * it can be scanned with "for (i = 0; latin1_comb[i].x1; i++)".
 * It is read-only lookup data and therefore declared const.
 */
static const struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
    { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
    { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
    { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
    { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
    /* omitted:    0xc6      LATIN CAPITAL LETTER AE */
    { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
    { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
    { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
    { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
    { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
    { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
    { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
    { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
    { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
    { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
    { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
    { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
    { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
    { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
    { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
    /* omitted:    0xd7      MULTIPLICATION SIGN */
    /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
    { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
    { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
    { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
    { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
    { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
    /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
    /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
    { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
    { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
    { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
    { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
    { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
    { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
    /* omitted:    0xe6      LATIN SMALL LETTER AE */
    { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
    { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
    { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
    { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
    { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
    { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
    { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
    { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
    { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
    /* omitted:    0xf0      LATIN SMALL LETTER ETH */
    { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
    { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
    { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
    { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
    { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
    { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
    /* omitted:    0xf7      DIVISION SIGN */
    /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
    { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
    { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
    { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
    { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
    { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
    /* omitted:    0xfe      LATIN SMALL LETTER THORN */
    { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */

    { 0, 0, 0}            /* terminator */
};
180
181 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
182                                          size_t inbytesleft, size_t *no_read)
183 {
184     unsigned long x = inp[0];
185     *no_read = 1;
186     return x;
187 }
188
189
190 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
191                              size_t inbytesleft, size_t *no_read)
192 {
193     if (inp[0] != 0xef)
194     {
195         *no_read = 0;
196         return 0;
197     }
198     if (inbytesleft < 3)
199     {
200         cd->my_errno = YAZ_ICONV_EINVAL;
201         return (size_t) -1;
202     }
203     if (inp[1] != 0xbb && inp[2] == 0xbf)
204         *no_read = 3;
205     else
206         *no_read = 0;
207     return 0;
208 }
209
210 unsigned long yaz_read_UTF8_char(unsigned char *inp,
211                                  size_t inbytesleft, size_t *no_read,
212                                  int *error)
213 {
214     unsigned long x = 0;
215
216     if (inp[0] <= 0x7f)
217     {
218         x = inp[0];
219         *no_read = 1;
220     }
221     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
222     {
223         *no_read = 0;
224         *error = YAZ_ICONV_EILSEQ;
225     }
226     else if (inp[0] <= 0xdf && inbytesleft >= 2)
227     {
228         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
229         if (x >= 0x80)
230             *no_read = 2;
231         else
232         {
233             *no_read = 0;
234             *error = YAZ_ICONV_EILSEQ;
235         }
236     }
237     else if (inp[0] <= 0xef && inbytesleft >= 3)
238     {
239         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
240             (inp[2] & 0x3f);
241         if (x >= 0x800)
242             *no_read = 3;
243         else
244         {
245             *no_read = 0;
246             *error = YAZ_ICONV_EILSEQ;
247         }
248     }
249     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
250     {
251         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
252             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
253         if (x >= 0x10000)
254             *no_read = 4;
255         else
256         {
257             *no_read = 0;
258             *error = YAZ_ICONV_EILSEQ;
259         }
260     }
261     else if (inp[0] <= 0xfb && inbytesleft >= 5)
262     {
263         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
264             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
265             (inp[4] & 0x3f);
266         if (x >= 0x200000)
267             *no_read = 5;
268         else
269         {
270             *no_read = 0;
271             *error = YAZ_ICONV_EILSEQ;
272         }
273     }
274     else if (inp[0] <= 0xfd && inbytesleft >= 6)
275     {
276         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
277             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
278             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
279         if (x >= 0x4000000)
280             *no_read = 6;
281         else
282         {
283             *no_read = 0;
284             *error = YAZ_ICONV_EILSEQ;
285         }
286     }
287     else
288     {
289         *no_read = 0;
290         *error = YAZ_ICONV_EINVAL;
291     }
292     return x;
293 }
294
295 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
296                                     size_t inbytesleft, size_t *no_read)
297 {
298     return yaz_read_UTF8_char(inp, inbytesleft, no_read, &cd->my_errno);
299 }
300
301 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
302                                     size_t inbytesleft, size_t *no_read)
303 {
304     unsigned long x = 0;
305     
306     if (inbytesleft < 4)
307     {
308         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
309         *no_read = 0;
310     }
311     else
312     {
313         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
314         *no_read = 4;
315     }
316     return x;
317 }
318
319 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
320                                       size_t inbytesleft, size_t *no_read)
321 {
322     unsigned long x = 0;
323     
324     if (inbytesleft < 4)
325     {
326         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
327         *no_read = 0;
328     }
329     else
330     {
331         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
332         *no_read = 4;
333     }
334     return x;
335 }
336
337 #if HAVE_WCHAR_H
338 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
339                                        size_t inbytesleft, size_t *no_read)
340 {
341     unsigned long x = 0;
342     
343     if (inbytesleft < sizeof(wchar_t))
344     {
345         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
346         *no_read = 0;
347     }
348     else
349     {
350         wchar_t wch;
351         memcpy (&wch, inp, sizeof(wch));
352         x = wch;
353         *no_read = sizeof(wch);
354     }
355     return x;
356 }
357 #endif
358
359
360 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
361                                           size_t inbytesleft, size_t *no_read,
362                                           int *comb);
363
364 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
365                                      size_t inbytesleft, size_t *no_read)
366 {
367     unsigned long x;
368     if (cd->comb_offset < cd->comb_size)
369     {
370         *no_read = cd->comb_no_read[cd->comb_offset];
371         x = cd->comb_x[cd->comb_offset];
372
373         /* special case for double-diacritic combining characters, 
374            INVERTED BREVE and DOUBLE TILDE.
375            We'll increment the no_read counter by 1, since we want to skip over
376            the processing of the closing ligature character
377         */
378         /* this code is no longer necessary.. our handlers code in
379            yaz_marc8_?_conv (generated by charconv.tcl) now returns
380            0 and no_read=1 when a sequence does not match the input.
381            The SECOND HALFs in codetables.xml produces a non-existant
382            entry in the conversion trie.. Hence when met, the input byte is
383            skipped as it should (in yaz_iconv)
384         */
385 #if 0
386         if (x == 0x0361 || x == 0x0360)
387             *no_read += 1;
388 #endif
389         cd->comb_offset++;
390         return x;
391     }
392
393     cd->comb_offset = 0;
394     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
395     {
396         int comb = 0;
397         if (inbytesleft == 0 && cd->comb_size)
398         {
399             cd->my_errno = YAZ_ICONV_EINVAL;
400             x = 0;
401             *no_read = 0;
402             break;
403         }
404         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
405         if (!comb || !x)
406             break;
407         cd->comb_x[cd->comb_size] = x;
408         cd->comb_no_read[cd->comb_size] = *no_read;
409         inp += *no_read;
410         inbytesleft = inbytesleft - *no_read;
411     }
412     return x;
413 }
414
415 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
416                                      size_t inbytesleft, size_t *no_read)
417 {
418     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
419     if (x && cd->comb_size == 1)
420     {
421         /* For MARC8s we try to get a Latin-1 page code out of it */
422         int i;
423         for (i = 0; latin1_comb[i].x1; i++)
424             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
425             {
426                 *no_read += cd->comb_no_read[0];
427                 cd->comb_size = 0;
428                 x = latin1_comb[i].y;
429                 break;
430             }
431     }
432     return x;
433 }
434
435 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
436                                          size_t inbytesleft, size_t *no_read,
437                                          int *comb)
438 {
439     *no_read = 0;
440     while(inbytesleft >= 1 && inp[0] == 27)
441     {
442         size_t inbytesleft0 = inbytesleft;
443         inp++;
444         inbytesleft--;
445         while(inbytesleft > 0 && strchr("(,$!)-", *inp))
446         {
447             inbytesleft--;
448             inp++;
449         }
450         if (inbytesleft <= 0)
451         {
452             *no_read = 0;
453             cd->my_errno = YAZ_ICONV_EINVAL;
454             return 0;
455         }
456         cd->marc8_esc_mode = *inp++;
457         inbytesleft--;
458         (*no_read) += inbytesleft0 - inbytesleft;
459     }
460     if (inbytesleft <= 0)
461         return 0;
462     else
463     {
464         unsigned long x;
465         size_t no_read_sub = 0;
466         *comb = 0;
467
468         switch(cd->marc8_esc_mode)
469         {
470         case 'B':  /* Basic ASCII */
471         case 'E':  /* ANSEL */
472         case 's':  /* ASCII */
473             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, comb);
474             break;
475         case 'g':  /* Greek */
476             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, comb);
477             break;
478         case 'b':  /* Subscripts */
479             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, comb);
480             break;
481         case 'p':  /* Superscripts */
482             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, comb);
483             break;
484         case '2':  /* Basic Hebrew */
485             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, comb);
486             break;
487         case 'N':  /* Basic Cyrillic */
488         case 'Q':  /* Extended Cyrillic */
489             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, comb);
490             break;
491         case '3':  /* Basic Arabic */
492         case '4':  /* Extended Arabic */
493             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, comb);
494             break;
495         case 'S':  /* Greek */
496             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, comb);
497             break;
498         case '1':  /* Chinese, Japanese, Korean (EACC) */
499             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, comb);
500             break;
501         default:
502             *no_read = 0;
503             cd->my_errno = YAZ_ICONV_EILSEQ;
504             return 0;
505         }
506         *no_read += no_read_sub;
507         return x;
508     }
509 }
510
511 static size_t yaz_write_UTF8(yaz_iconv_t cd, unsigned long x,
512                              char **outbuf, size_t *outbytesleft,
513                              int last)
514 {
515     return yaz_write_UTF8_char(x, outbuf, outbytesleft, &cd->my_errno);
516 }
517
518 size_t yaz_write_UTF8_char(unsigned long x,
519                            char **outbuf, size_t *outbytesleft,
520                            int *error)
521 {
522     unsigned char *outp = (unsigned char *) *outbuf;
523
524     if (x <= 0x7f && *outbytesleft >= 1)
525     {
526         *outp++ = (unsigned char) x;
527         (*outbytesleft)--;
528     } 
529     else if (x <= 0x7ff && *outbytesleft >= 2)
530     {
531         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
532         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
533         (*outbytesleft) -= 2;
534     }
535     else if (x <= 0xffff && *outbytesleft >= 3)
536     {
537         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
538         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
539         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
540         (*outbytesleft) -= 3;
541     }
542     else if (x <= 0x1fffff && *outbytesleft >= 4)
543     {
544         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
545         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
546         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
547         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
548         (*outbytesleft) -= 4;
549     }
550     else if (x <= 0x3ffffff && *outbytesleft >= 5)
551     {
552         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
553         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
554         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
555         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
556         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
557         (*outbytesleft) -= 5;
558     }
559     else if (*outbytesleft >= 6)
560     {
561         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
562         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
563         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
564         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
565         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
566         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
567         (*outbytesleft) -= 6;
568     }
569     else 
570     {
571         *error = YAZ_ICONV_E2BIG;  /* not room for output */
572         return (size_t)(-1);
573     }
574     *outbuf = (char *) outp;
575     return 0;
576 }
577
578
579 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
580                                    char **outbuf, size_t *outbytesleft,
581                                    int last)
582 {
583     /* list of two char unicode sequence that, when combined, are
584        equivalent to single unicode chars that can be represented in
585        ISO-8859-1/Latin-1.
586        Regular iconv on Linux at least does not seem to convert these,
587        but since MARC-8 to UTF-8 generates these composed sequence
588        we get a better chance of a successful MARC-8 -> ISO-8859-1
589        conversion */
590     unsigned char *outp = (unsigned char *) *outbuf;
591
592     if (cd->compose_char)
593     {
594         int i;
595         for (i = 0; latin1_comb[i].x1; i++)
596             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
597             {
598                 x = latin1_comb[i].y;
599                 break;
600             }
601         if (*outbytesleft < 1)
602         {  /* no room. Retain compose_char and bail out */
603             cd->my_errno = YAZ_ICONV_E2BIG;
604             return (size_t)(-1);
605         }
606         if (!latin1_comb[i].x1) 
607         {   /* not found. Just write compose_char */
608             *outp++ = (unsigned char) cd->compose_char;
609             (*outbytesleft)--;
610             *outbuf = (char *) outp;
611         }
612         /* compose_char used so reset it. x now holds current char */
613         cd->compose_char = 0;
614     }
615
616     if (!last && x > 32 && x < 127 && cd->compose_char == 0)
617     {
618         cd->compose_char = x;
619         return 0;
620     }
621     else if (x > 255 || x < 1)
622     {
623         cd->my_errno = YAZ_ICONV_EILSEQ;
624         return (size_t) -1;
625     }
626     else if (*outbytesleft < 1)
627     {
628         cd->my_errno = YAZ_ICONV_E2BIG;
629         return (size_t)(-1);
630     }
631     *outp++ = (unsigned char) x;
632     (*outbytesleft)--;
633     *outbuf = (char *) outp;
634     return 0;
635 }
636
637
638 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
639                               char **outbuf, size_t *outbytesleft,
640                               int last)
641 {
642     unsigned char *outp = (unsigned char *) *outbuf;
643     if (*outbytesleft >= 4)
644     {
645         *outp++ = (unsigned char) (x>>24);
646         *outp++ = (unsigned char) (x>>16);
647         *outp++ = (unsigned char) (x>>8);
648         *outp++ = (unsigned char) x;
649         (*outbytesleft) -= 4;
650     }
651     else
652     {
653         cd->my_errno = YAZ_ICONV_E2BIG;
654         return (size_t)(-1);
655     }
656     *outbuf = (char *) outp;
657     return 0;
658 }
659
660 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
661                                 char **outbuf, size_t *outbytesleft,
662                                 int last)
663 {
664     unsigned char *outp = (unsigned char *) *outbuf;
665     if (*outbytesleft >= 4)
666     {
667         *outp++ = (unsigned char) x;
668         *outp++ = (unsigned char) (x>>8);
669         *outp++ = (unsigned char) (x>>16);
670         *outp++ = (unsigned char) (x>>24);
671         (*outbytesleft) -= 4;
672     }
673     else
674     {
675         cd->my_errno = YAZ_ICONV_E2BIG;
676         return (size_t)(-1);
677     }
678     *outbuf = (char *) outp;
679     return 0;
680 }
681
682 static unsigned long lookup_marc8(yaz_iconv_t cd,
683                                   unsigned long x, int *comb,
684                                   const char **page_chr)
685 {
686     char utf8_buf[7];
687     char *utf8_outbuf = utf8_buf;
688     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
689
690     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft, 0);
691     if (r == (size_t)(-1))
692     {
693         cd->my_errno = YAZ_ICONV_EILSEQ;
694         return 0;
695     }
696     else
697     {
698         unsigned char *inp;
699         size_t inbytesleft, no_read_sub = 0;
700         unsigned long x;
701
702         *utf8_outbuf = '\0';        
703         inp = (unsigned char *) utf8_buf;
704         inbytesleft = strlen(utf8_buf);
705         
706         x = yaz_marc8r_1_conv(inp, inbytesleft, &no_read_sub, comb);
707         if (x)
708         {
709             *page_chr = "\033(B";
710             return x;
711         }
712         x = yaz_marc8r_2_conv(inp, inbytesleft, &no_read_sub, comb);
713         if (x)
714         {
715             *page_chr = "\033g";
716             return x;
717         }
718         x = yaz_marc8r_3_conv(inp, inbytesleft, &no_read_sub, comb);
719         if (x)
720         {
721             *page_chr = "\033b";
722             return x;
723         }
724         x = yaz_marc8r_4_conv(inp, inbytesleft, &no_read_sub, comb);
725         if (x)
726         {
727             *page_chr = "\033p";
728             return x;
729         }
730         x = yaz_marc8r_5_conv(inp, inbytesleft, &no_read_sub, comb);
731         if (x)
732         {
733             *page_chr = "\033(2";
734             return x;
735         }
736         x = yaz_marc8r_6_conv(inp, inbytesleft, &no_read_sub, comb);
737         if (x)
738         {
739             *page_chr = "\033(N";
740             return x;
741         }
742         x = yaz_marc8r_7_conv(inp, inbytesleft, &no_read_sub, comb);
743         if (x)
744         {
745             *page_chr = "\033(3";
746             return x;
747         }
748         x = yaz_marc8r_8_conv(inp, inbytesleft, &no_read_sub, comb);
749         if (x)
750         {
751             *page_chr = "\033(S";
752             return x;
753         }
754         x = yaz_marc8r_9_conv(inp, inbytesleft, &no_read_sub, comb);
755         if (x)
756         {
757             *page_chr = "\033$1";
758             return x;
759         }
760         cd->my_errno = YAZ_ICONV_EILSEQ;
761         return x;
762     }
763 }
764
765 static size_t flush_combos(yaz_iconv_t cd,
766                            char **outbuf, size_t *outbytesleft)
767 {
768     unsigned long y = cd->write_marc8_last;
769     unsigned char byte;
770     char out_buf[10];
771     size_t i, out_no = 0;
772
773     if (!y)
774         return 0;
775
776     byte = (unsigned char )((y>>16) & 0xff);
777     if (byte)
778         out_buf[out_no++] = byte;
779     byte = (unsigned char)((y>>8) & 0xff);
780     if (byte)
781         out_buf[out_no++] = byte;
782     byte = (unsigned char )(y & 0xff);
783     if (byte)
784         out_buf[out_no++] = byte;
785
786     if (out_no + cd->write_marc8_comb_no + 1 > *outbytesleft)
787     {
788         cd->my_errno = YAZ_ICONV_E2BIG;
789         return (size_t) (-1);
790     }
791
792     for (i = 0; i < cd->write_marc8_comb_no; i++)
793     {
794         /* all MARC-8 combined characters are simple bytes */
795         byte = (unsigned char )(cd->write_marc8_comb_ch[i]);
796         *(*outbuf)++ = byte;
797         (*outbytesleft)--;
798     }
799     memcpy(*outbuf, out_buf, out_no);
800     *outbuf += out_no;
801     (*outbytesleft) -= out_no;
802     if (cd->write_marc8_second_half_char)
803     {
804         *(*outbuf)++ = cd->write_marc8_second_half_char;
805         (*outbytesleft)--;
806     }        
807
808     cd->write_marc8_last = 0;
809     cd->write_marc8_comb_no = 0;
810     cd->write_marc8_second_half_char = 0;
811     return 0;
812 }
813
814 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
815                                 char **outbuf, size_t *outbytesleft,
816                                 int last)
817 {
818     int comb = 0;
819     const char *page_chr = 0;
820     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
821
822     if (!y)
823         return (size_t) (-1);
824
825     if (comb)
826     {
827         if (x == 0x0361)
828             cd->write_marc8_second_half_char = 0xEC;
829         else if (x == 0x0360)
830             cd->write_marc8_second_half_char = 0xFB;
831
832         if (cd->write_marc8_comb_no < 6)
833             cd->write_marc8_comb_ch[cd->write_marc8_comb_no++] = y;
834     }
835     else
836     {
837         size_t r = flush_combos(cd, outbuf, outbytesleft);
838         const char *old_page_chr = cd->write_marc8_page_chr;
839         if (r)
840             return r;
841         if (strcmp(page_chr, old_page_chr))
842         {
843             size_t plen = 0;
844             const char *page_out = page_chr;
845
846             if (*outbytesleft < 8)
847             {
848                 cd->my_errno = YAZ_ICONV_E2BIG;
849                 
850                 return (size_t) (-1);
851             }
852             cd->write_marc8_page_chr = page_chr;
853
854             if (!strcmp(old_page_chr, "\033p") 
855                 || !strcmp(old_page_chr, "\033g")
856                 || !strcmp(old_page_chr, "\033b"))
857             {
858                 /* Technique 1 leave */
859                 page_out = "\033s";
860                 if (strcmp(page_chr, "\033(B")) /* Not going ASCII page? */
861                 {
862                     /* Must leave script + enter new page */
863                     plen = strlen(page_out);
864                     memcpy(*outbuf, page_out, plen);
865                     (*outbuf) += plen;
866                     (*outbytesleft) -= plen;
867                     page_out = page_chr;
868                 }
869             }
870             plen = strlen(page_out);
871             memcpy(*outbuf, page_out, plen);
872             (*outbuf) += plen;
873             (*outbytesleft) -= plen;
874         }
875         cd->write_marc8_last = y;
876     }
877     if (last)
878     {
879         size_t r = flush_combos(cd, outbuf, outbytesleft);
880         if (r)
881         {
882             if (comb)
883                 cd->write_marc8_comb_no--;
884             else
885                 cd->write_marc8_last = 0;
886             return r;
887         }
888     }
889     return 0;
890 }
891
892 static size_t yaz_flush_marc8(yaz_iconv_t cd,
893                               char **outbuf, size_t *outbytesleft)
894 {
895     if (strcmp(cd->write_marc8_page_chr, "\033(B"))
896     {
897         if (*outbytesleft < 3)
898         {
899             cd->my_errno = YAZ_ICONV_E2BIG;
900             return (size_t) (-1);
901         }
902         memcpy(*outbuf, "\033(B", 3);
903         (*outbuf) += 3;
904         *outbytesleft -= 3;
905     }
906     return 0;
907 }
908
909 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
910                               char **outbuf, size_t *outbytesleft,
911                               int last)
912 {
913     int i;
914     for (i = 0; latin1_comb[i].x1; i++)
915     {
916         if (x == latin1_comb[i].y)
917         {
918             size_t r ;
919             /* save the output pointers .. */
920             char *outbuf0 = *outbuf;
921             size_t outbytesleft0 = *outbytesleft;
922             int last_ch = cd->write_marc8_last;
923
924             r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
925                                   outbuf, outbytesleft, 0);
926             if (r)
927                 return r;
928             r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
929                                   outbuf, outbytesleft, last);
930             if (r && cd->my_errno == YAZ_ICONV_E2BIG)
931             {
932                 /* not enough room. reset output to original values */
933                 *outbuf = outbuf0;
934                 *outbytesleft = outbytesleft0;
935                 cd->write_marc8_last = last_ch;
936             }
937             return r;
938         }
939     }
940     return yaz_write_marc8_2(cd, x, outbuf, outbytesleft, last);
941 }
942
943
#if HAVE_WCHAR_H
/**
 * \brief Write handler that stores x as one wchar_t in the output buffer.
 * \param cd conversion handle (only my_errno is touched, on failure)
 * \param x character to write; converted to wchar_t by assignment
 * \param outbuf output buffer position (advanced by sizeof(wchar_t))
 * \param outbytesleft bytes available in outbuf (reduced accordingly)
 * \param last unused by this handler
 * \returns 0 on success; (size_t)(-1) with cd->my_errno set to
 *          YAZ_ICONV_E2BIG if fewer than sizeof(wchar_t) bytes are left
 */
static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
                                 char **outbuf, size_t *outbytesleft,
                                 int last)
{
    wchar_t wide;

    if (*outbytesleft < sizeof(wchar_t))
    {
        cd->my_errno = YAZ_ICONV_E2BIG;
        return (size_t)(-1);
    }

    wide = x;
    /* memcpy because the output position need not be wchar_t aligned */
    memcpy(*outbuf, &wide, sizeof(wide));
    *outbuf += sizeof(wide);
    *outbytesleft -= sizeof(wide);
    return 0;
}
#endif
967
968 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
969 {
970     return cd->read_handle && cd->write_handle;
971 }
972
973 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
974 {
975     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
976
977     cd->write_handle = 0;
978     cd->read_handle = 0;
979     cd->init_handle = 0;
980     cd->flush_handle = 0;
981     cd->my_errno = YAZ_ICONV_UNKNOWN;
982
983     /* a useful hack: if fromcode has leading @,
984        the library not use YAZ's own conversions .. */
985     if (fromcode[0] == '@')
986         fromcode++;
987     else
988     {
989         if (!yaz_matchstr(fromcode, "UTF8"))
990         {
991             cd->read_handle = yaz_read_UTF8;
992             cd->init_handle = yaz_init_UTF8;
993         }
994         else if (!yaz_matchstr(fromcode, "ISO88591"))
995             cd->read_handle = yaz_read_ISO8859_1;
996         else if (!yaz_matchstr(fromcode, "UCS4"))
997             cd->read_handle = yaz_read_UCS4;
998         else if (!yaz_matchstr(fromcode, "UCS4LE"))
999             cd->read_handle = yaz_read_UCS4LE;
1000         else if (!yaz_matchstr(fromcode, "MARC8"))
1001             cd->read_handle = yaz_read_marc8;
1002         else if (!yaz_matchstr(fromcode, "MARC8s"))
1003             cd->read_handle = yaz_read_marc8s;
1004 #if HAVE_WCHAR_H
1005         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
1006             cd->read_handle = yaz_read_wchar_t;
1007 #endif
1008         
1009         if (!yaz_matchstr(tocode, "UTF8"))
1010             cd->write_handle = yaz_write_UTF8;
1011         else if (!yaz_matchstr(tocode, "ISO88591"))
1012             cd->write_handle = yaz_write_ISO8859_1;
1013         else if (!yaz_matchstr (tocode, "UCS4"))
1014             cd->write_handle = yaz_write_UCS4;
1015         else if (!yaz_matchstr(tocode, "UCS4LE"))
1016             cd->write_handle = yaz_write_UCS4LE;
1017         else if (!yaz_matchstr(tocode, "MARC8"))
1018         {
1019             cd->write_handle = yaz_write_marc8;
1020             cd->flush_handle = yaz_flush_marc8;
1021         }
1022         else if (!yaz_matchstr(tocode, "MARC8s"))
1023         {
1024             cd->write_handle = yaz_write_marc8;
1025             cd->flush_handle = yaz_flush_marc8;
1026         }
1027 #if HAVE_WCHAR_H
1028         else if (!yaz_matchstr(tocode, "WCHAR_T"))
1029             cd->write_handle = yaz_write_wchar_t;
1030 #endif
1031     }
1032 #if HAVE_ICONV_H
1033     cd->iconv_cd = 0;
1034     if (!cd->read_handle || !cd->write_handle)
1035     {
1036         cd->iconv_cd = iconv_open (tocode, fromcode);
1037         if (cd->iconv_cd == (iconv_t) (-1))
1038         {
1039             xfree (cd);
1040             return 0;
1041         }
1042     }
1043 #else
1044     if (!cd->read_handle || !cd->write_handle)
1045     {
1046         xfree (cd);
1047         return 0;
1048     }
1049 #endif
1050     cd->init_flag = 1;
1051     return cd;
1052 }
1053
1054 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
1055                  char **outbuf, size_t *outbytesleft)
1056 {
1057     char *inbuf0 = 0;
1058     size_t r = 0;
1059
1060 #if HAVE_ICONV_H
1061     if (cd->iconv_cd)
1062     {
1063         size_t r =
1064             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
1065         if (r == (size_t)(-1))
1066         {
1067             switch (yaz_errno())
1068             {
1069             case E2BIG:
1070                 cd->my_errno = YAZ_ICONV_E2BIG;
1071                 break;
1072             case EINVAL:
1073                 cd->my_errno = YAZ_ICONV_EINVAL;
1074                 break;
1075             case EILSEQ:
1076                 cd->my_errno = YAZ_ICONV_EILSEQ;
1077                 break;
1078             default:
1079                 cd->my_errno = YAZ_ICONV_UNKNOWN;
1080             }
1081         }
1082         return r;
1083     }
1084 #endif
1085
1086     if (inbuf)
1087         inbuf0 = *inbuf;
1088
1089     if (cd->init_flag)
1090     {
1091         cd->my_errno = YAZ_ICONV_UNKNOWN;
1092         cd->marc8_esc_mode = 'B';
1093         
1094         cd->comb_offset = cd->comb_size = 0;
1095         cd->compose_char = 0;
1096         
1097         cd->write_marc8_comb_no = 0;
1098         cd->write_marc8_second_half_char = 0;
1099         cd->write_marc8_last = 0;
1100         cd->write_marc8_page_chr = "\033(B";
1101         
1102         cd->unget_x = 0;
1103         cd->no_read_x = 0;
1104     }
1105
1106     if (cd->init_flag)
1107     {
1108         if (cd->init_handle && inbuf && *inbuf)
1109         {
1110             size_t no_read = 0;
1111             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
1112                                          *inbytesleft, &no_read);
1113             if (r)
1114             {
1115                 if (cd->my_errno == YAZ_ICONV_EINVAL)
1116                     return r;
1117                 cd->init_flag = 0;
1118                 return r;
1119             }
1120             *inbytesleft -= no_read;
1121             *inbuf += no_read;
1122         }
1123     }
1124     cd->init_flag = 0;
1125
1126     while (1)
1127     {
1128         unsigned long x;
1129         size_t no_read;
1130
1131         if (cd->unget_x)
1132         {
1133             x = cd->unget_x;
1134             no_read = cd->no_read_x;
1135         }
1136         else if (inbuf && *inbuf)
1137         {
1138             if (*inbytesleft == 0)
1139             {
1140                 r = *inbuf - inbuf0;
1141                 break;
1142             }
1143             x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1144                                   &no_read);
1145             if (no_read == 0)
1146             {
1147                 r = (size_t)(-1);
1148                 break;
1149             }
1150         }
1151         else
1152         {
1153             r = 0;
1154             if (cd->flush_handle && outbuf && *outbuf)
1155                 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
1156             if (r == 0)
1157                 cd->init_flag = 1;
1158             break;
1159         }
1160         if (x)
1161         {
1162             r = (cd->write_handle)(cd, x, outbuf, outbytesleft,
1163                                    (*inbytesleft - no_read) == 0 ? 1 : 0);
1164             if (r)
1165             {
1166                 /* unable to write it. save it because read_handle cannot
1167                    rewind .. */
1168                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1169                 {
1170                     cd->unget_x = x;
1171                     cd->no_read_x = no_read;
1172                     break;
1173                 }
1174             }
1175             cd->unget_x = 0;
1176         }
1177         *inbytesleft -= no_read;
1178         (*inbuf) += no_read;
1179     }
1180     return r;
1181 }
1182
1183 int yaz_iconv_error (yaz_iconv_t cd)
1184 {
1185     return cd->my_errno;
1186 }
1187
1188 int yaz_iconv_close (yaz_iconv_t cd)
1189 {
1190 #if HAVE_ICONV_H
1191     if (cd->iconv_cd)
1192         iconv_close (cd->iconv_cd);
1193 #endif
1194     xfree (cd);
1195     return 0;
1196 }
1197
1198 /*
1199  * Local variables:
1200  * c-basic-offset: 4
1201  * indent-tabs-mode: nil
1202  * End:
1203  * vim: shiftwidth=4 tabstop=8 expandtab
1204  */
1205