Fixed bug #258: marc-8 to utf8 fails for some composed sequences
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2005, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.9 2005-02-01 21:06:37 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  */
16
17 #if HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20
21 #include <errno.h>
22 #include <string.h>
23 #include <ctype.h>
24 #if HAVE_WCHAR_H
25 #include <wchar.h>
26 #endif
27
28 #if HAVE_ICONV_H
29 #include <iconv.h>
30 #endif
31
32 #include <yaz/yaz-util.h>
33
/*
 * MARC-8 character set decoders, one per supported character set.
 * Their definitions are not part of this file. yaz_read_marc8 picks one
 * of them from the current escape mode and uses the results as follows:
 * the return value is the decoded character, *no_read is the number of
 * input bytes the character occupied and *combining is non-zero when the
 * character is a combining one.
 */
extern unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
                                      size_t *no_read, int *combining);
extern unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
                                      size_t *no_read, int *combining);
extern unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
                                      size_t *no_read, int *combining);
extern unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
                                      size_t *no_read, int *combining);
extern unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
                                      size_t *no_read, int *combining);
extern unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
                                      size_t *no_read, int *combining);
extern unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
                                      size_t *no_read, int *combining);
extern unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
                                      size_t *no_read, int *combining);
extern unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
                                      size_t *no_read, int *combining);
52     
53 struct yaz_iconv_struct {
54     int my_errno;
55     int init_flag;
56     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
57                           size_t inbytesleft, size_t *no_read);
58     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
59                                  size_t inbytesleft, size_t *no_read);
60     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
61                            char **outbuf, size_t *outbytesleft);
62     int marc8_esc_mode;
63     int marc8_comb_x;
64     int marc8_comb_no_read;
65     size_t no_read_x;
66     unsigned unget_x;
67 #if HAVE_ICONV_H
68     iconv_t iconv_cd;
69 #endif
70 };
71
72 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
73                                          size_t inbytesleft, size_t *no_read)
74 {
75     unsigned long x = inp[0];
76     *no_read = 1;
77     return x;
78 }
79
80 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
81                              size_t inbytesleft, size_t *no_read)
82 {
83     if (inp[0] != 0xef)
84     {
85         *no_read = 0;
86         return 0;
87     }
88     if (inbytesleft < 3)
89     {
90         cd->my_errno = YAZ_ICONV_EINVAL;
91         return (size_t) -1;
92     }
93     if (inp[1] != 0xbb || inp[2] != 0xbf)
94     {
95         cd->my_errno = YAZ_ICONV_EILSEQ;
96         return (size_t) -1;
97     }
98     *no_read = 3;
99     return 0;
100 }
101
102 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
103                                     size_t inbytesleft, size_t *no_read)
104 {
105     unsigned long x = 0;
106
107     if (inp[0] <= 0x7f)
108     {
109         x = inp[0];
110         *no_read = 1;
111     }
112     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
113     {
114         *no_read = 0;
115         cd->my_errno = YAZ_ICONV_EILSEQ;
116     }
117     else if (inp[0] <= 0xdf && inbytesleft >= 2)
118     {
119         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
120         if (x >= 0x80)
121             *no_read = 2;
122         else
123         {
124             *no_read = 0;
125             cd->my_errno = YAZ_ICONV_EILSEQ;
126         }
127     }
128     else if (inp[0] <= 0xef && inbytesleft >= 3)
129     {
130         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
131             (inp[1] & 0x3f);
132         if (x >= 0x800)
133             *no_read = 3;
134         else
135         {
136             *no_read = 0;
137             cd->my_errno = YAZ_ICONV_EILSEQ;
138         }
139     }
140     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
141     {
142         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
143             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
144         if (x >= 0x10000)
145             *no_read = 4;
146         else
147         {
148             *no_read = 0;
149             cd->my_errno = YAZ_ICONV_EILSEQ;
150         }
151     }
152     else if (inp[0] <= 0xfb && inbytesleft >= 5)
153     {
154         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
155             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
156             (inp[4] & 0x3f);
157         if (x >= 0x200000)
158             *no_read = 5;
159         else
160         {
161             *no_read = 0;
162             cd->my_errno = YAZ_ICONV_EILSEQ;
163         }
164     }
165     else if (inp[0] <= 0xfd && inbytesleft >= 6)
166     {
167         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
168             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
169             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
170         if (x >= 0x4000000)
171             *no_read = 6;
172         else
173         {
174             *no_read = 0;
175             cd->my_errno = YAZ_ICONV_EILSEQ;
176         }
177     }
178     else
179     {
180         *no_read = 0;
181         cd->my_errno = YAZ_ICONV_EINVAL;
182     }
183     return x;
184 }
185
186 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
187                                     size_t inbytesleft, size_t *no_read)
188 {
189     unsigned long x = 0;
190     
191     if (inbytesleft < 4)
192     {
193         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
194         *no_read = 0;
195     }
196     else
197     {
198         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
199         *no_read = 4;
200     }
201     return x;
202 }
203
204 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
205                                       size_t inbytesleft, size_t *no_read)
206 {
207     unsigned long x = 0;
208     
209     if (inbytesleft < 4)
210     {
211         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
212         *no_read = 0;
213     }
214     else
215     {
216         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
217         *no_read = 4;
218     }
219     return x;
220 }
221
222 #if HAVE_WCHAR_H
223 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
224                                        size_t inbytesleft, size_t *no_read)
225 {
226     unsigned long x = 0;
227     
228     if (inbytesleft < sizeof(wchar_t))
229     {
230         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
231         *no_read = 0;
232     }
233     else
234     {
235         wchar_t wch;
236         memcpy (&wch, inp, sizeof(wch));
237         x = wch;
238         *no_read = sizeof(wch);
239     }
240     return x;
241 }
242 #endif
243
244 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
245                                      size_t inbytesleft, size_t *no_read)
246 {
247     if (cd->marc8_comb_x)
248     {
249         unsigned long x = cd->marc8_comb_x;
250         *no_read = cd->marc8_comb_no_read;
251         cd->marc8_comb_x = 0;
252         return x;
253     }
254     *no_read = 0;
255     while(inbytesleft >= 1 && inp[0] == 27)
256     {
257         size_t inbytesleft0 = inbytesleft;
258         inp++;
259         inbytesleft--;
260         while(inbytesleft > 0 && strchr("(,$!", *inp))
261         {
262             inbytesleft--;
263             inp++;
264         }
265         if (inbytesleft <= 0)
266         {
267             *no_read = 0;
268             cd->my_errno = YAZ_ICONV_EINVAL;
269             return 0;
270         }
271         cd->marc8_esc_mode = *inp++;
272         inbytesleft--;
273         (*no_read) += inbytesleft0 - inbytesleft;
274     }
275     if (inbytesleft <= 0)
276         return 0;
277     else
278     {
279         unsigned long x;
280         int comb = 0;
281         size_t no_read_sub = 0;
282
283         switch(cd->marc8_esc_mode)
284         {
285         case 'B':  /* Basic ASCII */
286         case 'E':  /* ANSEL */
287         case 's':  /* ASCII */
288             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, &comb);
289             break;
290         case 'g':  /* Greek */
291             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, &comb);
292             break;
293         case 'b':  /* Subscripts */
294             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, &comb);
295             break;
296         case 'p':  /* Superscripts */
297             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, &comb);
298             break;
299         case '2':  /* Basic Hebrew */
300             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, &comb);
301             break;
302         case 'N':  /* Basic Cyrillic */
303         case 'Q':  /* Extended Cyrillic */
304             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, &comb);
305             break;
306         case '3':  /* Basic Arabic */
307         case '4':  /* Extended Arabic */
308             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, &comb);
309             break;
310         case 'S':  /* Greek */
311             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, &comb);
312             break;
313         case '1':  /* Chinese, Japanese, Korean (EACC) */
314             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, &comb);
315             break;
316         default:
317             *no_read = 0;
318             cd->my_errno = YAZ_ICONV_EILSEQ;
319             return 0;
320         }
321 #if 0
322         printf ("esc mode=%c x=%04lX comb=%d\n", cd->marc8_esc_mode, x, comb);
323 #endif
324         *no_read += no_read_sub;
325
326         if (comb && cd->marc8_comb_x == 0)
327         {
328             size_t tmp_read = 0;
329             unsigned long next_x;
330
331             /* read next char .. */
332             next_x = yaz_read_marc8(cd, inp + *no_read,
333                                     inbytesleft - *no_read, &tmp_read);
334             /* save this x for later .. */
335             cd->marc8_comb_x = x;
336             /* save next read for later .. */
337             cd->marc8_comb_no_read = tmp_read;
338             /* return next x - thereby swap */
339             x = next_x;
340         }
341         return x;
342     }
343 }
344
345 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
346                               char **outbuf, size_t *outbytesleft)
347 {
348     unsigned char *outp = (unsigned char *) *outbuf;
349     if (x <= 0x7f && *outbytesleft >= 1)
350     {
351         *outp++ = (unsigned char) x;
352         (*outbytesleft)--;
353     } 
354     else if (x <= 0x7ff && *outbytesleft >= 2)
355     {
356         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
357         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
358         (*outbytesleft) -= 2;
359     }
360     else if (x <= 0xffff && *outbytesleft >= 3)
361     {
362         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
363         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
364         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
365         (*outbytesleft) -= 3;
366     }
367     else if (x <= 0x1fffff && *outbytesleft >= 4)
368     {
369         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
370         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
371         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
372         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
373         (*outbytesleft) -= 4;
374     }
375     else if (x <= 0x3ffffff && *outbytesleft >= 5)
376     {
377         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
378         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
379         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
380         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
381         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
382         (*outbytesleft) -= 5;
383     }
384     else if (*outbytesleft >= 6)
385     {
386         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
387         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
388         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
389         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
390         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
391         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
392         (*outbytesleft) -= 6;
393     }
394     else 
395     {
396         cd->my_errno = YAZ_ICONV_E2BIG;  /* not room for output */
397         return (size_t)(-1);
398     }
399     *outbuf = (char *) outp;
400     return 0;
401 }
402
403 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
404                                    char **outbuf, size_t *outbytesleft)
405 {
406     unsigned char *outp = (unsigned char *) *outbuf;
407     if (x > 255 || x < 1)
408     {
409         cd->my_errno = YAZ_ICONV_EILSEQ;
410         return (size_t) -1;
411     }
412     else if (*outbytesleft >= 1)
413     {
414         *outp++ = (unsigned char) x;
415         (*outbytesleft)--;
416     }
417     else 
418     {
419         cd->my_errno = YAZ_ICONV_E2BIG;
420         return (size_t)(-1);
421     }
422     *outbuf = (char *) outp;
423     return 0;
424 }
425
426
427 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
428                               char **outbuf, size_t *outbytesleft)
429 {
430     unsigned char *outp = (unsigned char *) *outbuf;
431     if (*outbytesleft >= 4)
432     {
433         *outp++ = (unsigned char) (x>>24);
434         *outp++ = (unsigned char) (x>>16);
435         *outp++ = (unsigned char) (x>>8);
436         *outp++ = (unsigned char) x;
437         (*outbytesleft) -= 4;
438     }
439     else
440     {
441         cd->my_errno = YAZ_ICONV_E2BIG;
442         return (size_t)(-1);
443     }
444     *outbuf = (char *) outp;
445     return 0;
446 }
447
448 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
449                                 char **outbuf, size_t *outbytesleft)
450 {
451     unsigned char *outp = (unsigned char *) *outbuf;
452     if (*outbytesleft >= 4)
453     {
454         *outp++ = (unsigned char) x;
455         *outp++ = (unsigned char) (x>>8);
456         *outp++ = (unsigned char) (x>>16);
457         *outp++ = (unsigned char) (x>>24);
458         (*outbytesleft) -= 4;
459     }
460     else
461     {
462         cd->my_errno = YAZ_ICONV_E2BIG;
463         return (size_t)(-1);
464     }
465     *outbuf = (char *) outp;
466     return 0;
467 }
468
469 #if HAVE_WCHAR_H
470 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
471                                  char **outbuf, size_t *outbytesleft)
472 {
473     unsigned char *outp = (unsigned char *) *outbuf;
474
475     if (*outbytesleft >= sizeof(wchar_t))
476     {
477         wchar_t wch = x;
478         memcpy(outp, &wch, sizeof(wch));
479         outp += sizeof(wch);
480         (*outbytesleft) -= sizeof(wch);
481     }
482     else
483     {
484         cd->my_errno = YAZ_ICONV_E2BIG;
485         return (size_t)(-1);
486     }
487     *outbuf = (char *) outp;
488     return 0;
489 }
490 #endif
491
492 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
493 {
494     return cd->read_handle && cd->write_handle;
495 }
496
497 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
498 {
499     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
500
501     cd->write_handle = 0;
502     cd->read_handle = 0;
503     cd->init_handle = 0;
504     cd->my_errno = YAZ_ICONV_UNKNOWN;
505     cd->marc8_esc_mode = 'B';
506     cd->marc8_comb_x = 0;
507
508     /* a useful hack: if fromcode has leading @,
509        the library not use YAZ's own conversions .. */
510     if (fromcode[0] == '@')
511         fromcode++;
512     else
513     {
514         if (!yaz_matchstr(fromcode, "UTF8"))
515         {
516             cd->read_handle = yaz_read_UTF8;
517             cd->init_handle = yaz_init_UTF8;
518         }
519         else if (!yaz_matchstr(fromcode, "ISO88591"))
520             cd->read_handle = yaz_read_ISO8859_1;
521         else if (!yaz_matchstr(fromcode, "UCS4"))
522             cd->read_handle = yaz_read_UCS4;
523         else if (!yaz_matchstr(fromcode, "UCS4LE"))
524             cd->read_handle = yaz_read_UCS4LE;
525         else if (!yaz_matchstr(fromcode, "MARC8"))
526             cd->read_handle = yaz_read_marc8;
527 #if HAVE_WCHAR_H
528         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
529             cd->read_handle = yaz_read_wchar_t;
530 #endif
531         
532         if (!yaz_matchstr(tocode, "UTF8"))
533             cd->write_handle = yaz_write_UTF8;
534         else if (!yaz_matchstr(tocode, "ISO88591"))
535             cd->write_handle = yaz_write_ISO8859_1;
536         else if (!yaz_matchstr (tocode, "UCS4"))
537             cd->write_handle = yaz_write_UCS4;
538         else if (!yaz_matchstr(tocode, "UCS4LE"))
539             cd->write_handle = yaz_write_UCS4LE;
540 #if HAVE_WCHAR_H
541         else if (!yaz_matchstr(tocode, "WCHAR_T"))
542             cd->write_handle = yaz_write_wchar_t;
543 #endif
544     }
545 #if HAVE_ICONV_H
546     cd->iconv_cd = 0;
547     if (!cd->read_handle || !cd->write_handle)
548     {
549         cd->iconv_cd = iconv_open (tocode, fromcode);
550         if (cd->iconv_cd == (iconv_t) (-1))
551         {
552             xfree (cd);
553             return 0;
554         }
555     }
556 #else
557     if (!cd->read_handle || !cd->write_handle)
558     {
559         xfree (cd);
560         return 0;
561     }
562 #endif
563     cd->init_flag = 1;
564     return cd;
565 }
566
567 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
568                  char **outbuf, size_t *outbytesleft)
569 {
570     char *inbuf0;
571     size_t r = 0;
572 #if HAVE_ICONV_H
573     if (cd->iconv_cd)
574     {
575         size_t r =
576             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
577         if (r == (size_t)(-1))
578         {
579             switch (yaz_errno())
580             {
581             case E2BIG:
582                 cd->my_errno = YAZ_ICONV_E2BIG;
583                 break;
584             case EINVAL:
585                 cd->my_errno = YAZ_ICONV_EINVAL;
586                 break;
587             case EILSEQ:
588                 cd->my_errno = YAZ_ICONV_EILSEQ;
589                 break;
590             default:
591                 cd->my_errno = YAZ_ICONV_UNKNOWN;
592             }
593         }
594         return r;
595     }
596 #endif
597     if (inbuf == 0 || *inbuf == 0)
598     {
599         cd->init_flag = 1;
600         cd->my_errno = YAZ_ICONV_UNKNOWN;
601         return 0;
602     }
603     inbuf0 = *inbuf;
604
605     if (cd->init_flag)
606     {
607         if (cd->init_handle)
608         {
609             size_t no_read;
610             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
611                                          *inbytesleft, &no_read);
612             if (r)
613             {
614                 if (cd->my_errno == YAZ_ICONV_EINVAL)
615                     return r;
616                 cd->init_flag = 0;
617                 return r;
618             }
619             *inbytesleft -= no_read;
620             *inbuf += no_read;
621         }
622         cd->init_flag = 0;
623         cd->unget_x = 0;
624         cd->no_read_x = 0;
625     }
626     while (1)
627     {
628         unsigned long x;
629         size_t no_read;
630
631         if (*inbytesleft == 0)
632         {
633             r = *inbuf - inbuf0;
634             break;
635         }
636         if (!cd->unget_x)
637         {
638             x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
639                                   &no_read);
640             if (no_read == 0)
641             {
642                 r = (size_t)(-1);
643                 break;
644             }
645         }
646         else
647         {
648             x = cd->unget_x;
649             no_read = cd->no_read_x;
650         }
651         if (x)
652         {
653             r = (cd->write_handle)(cd, x, outbuf, outbytesleft);
654             if (r)
655             {
656                 /* unable to write it. save it because read_handle cannot
657                    rewind .. */
658                 cd->unget_x = x;
659                 cd->no_read_x = no_read;
660                 break;
661             }
662             cd->unget_x = 0;
663         }
664         *inbytesleft -= no_read;
665         (*inbuf) += no_read;
666     }
667     return r;
668 }
669
670 int yaz_iconv_error (yaz_iconv_t cd)
671 {
672     return cd->my_errno;
673 }
674
675 int yaz_iconv_close (yaz_iconv_t cd)
676 {
677 #if HAVE_ICONV_H
678     if (cd->iconv_cd)
679         iconv_close (cd->iconv_cd);
680 #endif
681     xfree (cd);
682     return 0;
683 }
684
685