c916cad2fbb66013d8955d4a7badb37de54d685d
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (c) 1997-2004, Index Data
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.7 2004-10-15 00:19:00 adam Exp $
6  */
/**
 * \file siconv.c
 * \brief Implements simple ICONV
 *
 * This implements an interface similar to that of iconv and
 * is used by YAZ to interface with iconv (if present).
 * For systems where iconv is not present, this layer
 * provides a few important conversions: UTF-8, MARC-8, Latin-1.
 */
16
17 #if HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20
21 #include <errno.h>
22 #include <string.h>
23 #include <ctype.h>
24 #if HAVE_WCHAR_H
25 #include <wchar.h>
26 #endif
27
28 #if HAVE_ICONV_H
29 #include <iconv.h>
30 #endif
31
32 #include <yaz/yaz-util.h>
33
/*
 * MARC-8 character set decoders. Their definitions are not in this file.
 * yaz_read_marc8 picks one of them from the current escape mode, hands it
 * the input pointer and the number of bytes available, and afterwards uses
 * *no_read as the number of bytes consumed and *combining as a flag that
 * the returned character is a combining one.
 */

/* escape modes 'B', 'E' and 's' */
unsigned long yaz_marc8_1_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);

/* escape mode 'g' */
unsigned long yaz_marc8_2_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);

/* escape mode 'b' */
unsigned long yaz_marc8_3_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);

/* escape mode 'p' */
unsigned long yaz_marc8_4_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);

/* escape mode '2' */
unsigned long yaz_marc8_5_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);

/* escape modes 'N' and 'Q' */
unsigned long yaz_marc8_6_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);

/* escape modes '3' and '4' */
unsigned long yaz_marc8_7_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);

/* escape mode 'S' */
unsigned long yaz_marc8_8_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);

/* escape mode '1' */
unsigned long yaz_marc8_9_conv(unsigned char *inp, size_t inbytesleft,
                               size_t *no_read, int *combining);

52     
53 struct yaz_iconv_struct {
54     int my_errno;
55     int init_flag;
56     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
57                           size_t inbytesleft, size_t *no_read);
58     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
59                                  size_t inbytesleft, size_t *no_read);
60     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
61                            char **outbuf, size_t *outbytesleft);
62     int marc8_esc_mode;
63     int marc8_comb_x;
64     int marc8_comb_no_read;
65 #if HAVE_ICONV_H
66     iconv_t iconv_cd;
67 #endif
68 };
69
70 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
71                                          size_t inbytesleft, size_t *no_read)
72 {
73     unsigned long x = inp[0];
74     *no_read = 1;
75     return x;
76 }
77
78 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
79                              size_t inbytesleft, size_t *no_read)
80 {
81     if (inp[0] != 0xef)
82     {
83         *no_read = 0;
84         return 0;
85     }
86     if (inbytesleft < 3)
87     {
88         cd->my_errno = YAZ_ICONV_EINVAL;
89         return (size_t) -1;
90     }
91     if (inp[1] != 0xbb || inp[2] != 0xbf)
92     {
93         cd->my_errno = YAZ_ICONV_EILSEQ;
94         return (size_t) -1;
95     }
96     *no_read = 3;
97     return 0;
98 }
99
100 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
101                                     size_t inbytesleft, size_t *no_read)
102 {
103     unsigned long x = 0;
104
105     if (inp[0] <= 0x7f)
106     {
107         x = inp[0];
108         *no_read = 1;
109     }
110     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
111     {
112         *no_read = 0;
113         cd->my_errno = YAZ_ICONV_EILSEQ;
114     }
115     else if (inp[0] <= 0xdf && inbytesleft >= 2)
116     {
117         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
118         if (x >= 0x80)
119             *no_read = 2;
120         else
121         {
122             *no_read = 0;
123             cd->my_errno = YAZ_ICONV_EILSEQ;
124         }
125     }
126     else if (inp[0] <= 0xef && inbytesleft >= 3)
127     {
128         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
129             (inp[1] & 0x3f);
130         if (x >= 0x800)
131             *no_read = 3;
132         else
133         {
134             *no_read = 0;
135             cd->my_errno = YAZ_ICONV_EILSEQ;
136         }
137     }
138     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
139     {
140         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
141             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
142         if (x >= 0x10000)
143             *no_read = 4;
144         else
145         {
146             *no_read = 0;
147             cd->my_errno = YAZ_ICONV_EILSEQ;
148         }
149     }
150     else if (inp[0] <= 0xfb && inbytesleft >= 5)
151     {
152         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
153             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
154             (inp[4] & 0x3f);
155         if (x >= 0x200000)
156             *no_read = 5;
157         else
158         {
159             *no_read = 0;
160             cd->my_errno = YAZ_ICONV_EILSEQ;
161         }
162     }
163     else if (inp[0] <= 0xfd && inbytesleft >= 6)
164     {
165         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
166             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
167             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
168         if (x >= 0x4000000)
169             *no_read = 6;
170         else
171         {
172             *no_read = 0;
173             cd->my_errno = YAZ_ICONV_EILSEQ;
174         }
175     }
176     else
177     {
178         *no_read = 0;
179         cd->my_errno = YAZ_ICONV_EINVAL;
180     }
181     return x;
182 }
183
184 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
185                                     size_t inbytesleft, size_t *no_read)
186 {
187     unsigned long x = 0;
188     
189     if (inbytesleft < 4)
190     {
191         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
192         *no_read = 0;
193     }
194     else
195     {
196         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
197         *no_read = 4;
198     }
199     return x;
200 }
201
202 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
203                                       size_t inbytesleft, size_t *no_read)
204 {
205     unsigned long x = 0;
206     
207     if (inbytesleft < 4)
208     {
209         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
210         *no_read = 0;
211     }
212     else
213     {
214         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
215         *no_read = 4;
216     }
217     return x;
218 }
219
#if HAVE_WCHAR_H
/**
 * Read handler for input made of native wchar_t objects.
 *
 * \param cd          conversion descriptor; my_errno is set on failure
 * \param inp         input bytes (copied out, so no alignment is required)
 * \param inbytesleft number of input bytes available
 * \param no_read     set to sizeof(wchar_t) on success, 0 otherwise
 * \return the wchar_t value converted to unsigned long; 0 with my_errno
 *         set to YAZ_ICONV_EINVAL when fewer than sizeof(wchar_t) bytes
 *         remain
 */
static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
                                       size_t inbytesleft, size_t *no_read)
{
    wchar_t wide;

    if (inbytesleft < sizeof(wide))
    {
        /* incomplete input */
        cd->my_errno = YAZ_ICONV_EINVAL;
        *no_read = 0;
        return 0;
    }
    memcpy (&wide, inp, sizeof(wide));
    *no_read = sizeof(wide);
    return (unsigned long) wide;
}
#endif
241
242 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
243                                      size_t inbytesleft, size_t *no_read)
244 {
245     if (cd->marc8_comb_x)
246     {
247         unsigned long x = cd->marc8_comb_x;
248         *no_read = cd->marc8_comb_no_read;
249         cd->marc8_comb_x = 0;
250         return x;
251     }
252     *no_read = 0;
253     while(inbytesleft >= 1 && inp[0] == 27)
254     {
255         size_t inbytesleft0 = inbytesleft;
256         inp++;
257         inbytesleft--;
258         while(inbytesleft > 0 && strchr("(,$!", *inp))
259         {
260             inbytesleft--;
261             inp++;
262         }
263         if (inbytesleft <= 0)
264         {
265             *no_read = 0;
266             cd->my_errno = YAZ_ICONV_EINVAL;
267             return 0;
268         }
269         cd->marc8_esc_mode = *inp++;
270         inbytesleft--;
271         (*no_read) += inbytesleft0 - inbytesleft;
272     }
273     if (inbytesleft <= 0)
274         return 0;
275     else
276     {
277         unsigned long x;
278         int comb = 0;
279         size_t no_read_sub = 0;
280
281         switch(cd->marc8_esc_mode)
282         {
283         case 'B':  /* Basic ASCII */
284         case 'E':  /* ANSEL */
285         case 's':  /* ASCII */
286             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, &comb);
287             break;
288         case 'g':  /* Greek */
289             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, &comb);
290             break;
291         case 'b':  /* Subscripts */
292             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, &comb);
293             break;
294         case 'p':  /* Superscripts */
295             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, &comb);
296             break;
297         case '2':  /* Basic Hebrew */
298             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, &comb);
299             break;
300         case 'N':  /* Basic Cyrillic */
301         case 'Q':  /* Extended Cyrillic */
302             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, &comb);
303             break;
304         case '3':  /* Basic Arabic */
305         case '4':  /* Extended Arabic */
306             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, &comb);
307             break;
308         case 'S':  /* Greek */
309             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, &comb);
310             break;
311         case '1':  /* Chinese, Japanese, Korean (EACC) */
312             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, &comb);
313             break;
314         default:
315             *no_read = 0;
316             cd->my_errno = YAZ_ICONV_EILSEQ;
317             return 0;
318         }
319 #if 0
320         printf ("esc mode=%c x=%04lX comb=%d\n", cd->marc8_esc_mode, x, comb);
321 #endif
322         *no_read += no_read_sub;
323
324         if (comb && cd->marc8_comb_x == 0)
325         {
326             size_t tmp_read = 0;
327             unsigned long next_x;
328
329             /* read next char .. */
330             next_x = yaz_read_marc8(cd, inp + *no_read,
331                                     inbytesleft - *no_read, &tmp_read);
332             /* save this x for later .. */
333             cd->marc8_comb_x = x;
334             /* save next read for later .. */
335             cd->marc8_comb_no_read = tmp_read;
336             /* return next x - thereby swap */
337             x = next_x;
338         }
339         return x;
340     }
341 }
342
343 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
344                               char **outbuf, size_t *outbytesleft)
345 {
346     unsigned char *outp = (unsigned char *) *outbuf;
347     if (x <= 0x7f && *outbytesleft >= 1)
348     {
349         *outp++ = (unsigned char) x;
350         (*outbytesleft)--;
351     } 
352     else if (x <= 0x7ff && *outbytesleft >= 2)
353     {
354         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
355         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
356         (*outbytesleft) -= 2;
357     }
358     else if (x <= 0xffff && *outbytesleft >= 3)
359     {
360         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
361         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
362         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
363         (*outbytesleft) -= 3;
364     }
365     else if (x <= 0x1fffff && *outbytesleft >= 4)
366     {
367         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
368         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
369         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
370         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
371         (*outbytesleft) -= 4;
372     }
373     else if (x <= 0x3ffffff && *outbytesleft >= 5)
374     {
375         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
376         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
377         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
378         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
379         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
380         (*outbytesleft) -= 5;
381     }
382     else if (*outbytesleft >= 6)
383     {
384         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
385         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
386         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
387         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
388         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
389         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
390         (*outbytesleft) -= 6;
391     }
392     else 
393     {
394         cd->my_errno = YAZ_ICONV_E2BIG;  /* not room for output */
395         return (size_t)(-1);
396     }
397     *outbuf = (char *) outp;
398     return 0;
399 }
400
401 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
402                                    char **outbuf, size_t *outbytesleft)
403 {
404     unsigned char *outp = (unsigned char *) *outbuf;
405     if (x > 255 || x < 1)
406     {
407         cd->my_errno = YAZ_ICONV_EILSEQ;
408         return (size_t) -1;
409     }
410     else if (*outbytesleft >= 1)
411     {
412         *outp++ = (unsigned char) x;
413         (*outbytesleft)--;
414     }
415     else 
416     {
417         cd->my_errno = YAZ_ICONV_E2BIG;
418         return (size_t)(-1);
419     }
420     *outbuf = (char *) outp;
421     return 0;
422 }
423
424
425 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
426                               char **outbuf, size_t *outbytesleft)
427 {
428     unsigned char *outp = (unsigned char *) *outbuf;
429     if (*outbytesleft >= 4)
430     {
431         *outp++ = (unsigned char) (x>>24);
432         *outp++ = (unsigned char) (x>>16);
433         *outp++ = (unsigned char) (x>>8);
434         *outp++ = (unsigned char) x;
435         (*outbytesleft) -= 4;
436     }
437     else
438     {
439         cd->my_errno = YAZ_ICONV_E2BIG;
440         return (size_t)(-1);
441     }
442     *outbuf = (char *) outp;
443     return 0;
444 }
445
446 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
447                                 char **outbuf, size_t *outbytesleft)
448 {
449     unsigned char *outp = (unsigned char *) *outbuf;
450     if (*outbytesleft >= 4)
451     {
452         *outp++ = (unsigned char) x;
453         *outp++ = (unsigned char) (x>>8);
454         *outp++ = (unsigned char) (x>>16);
455         *outp++ = (unsigned char) (x>>24);
456         (*outbytesleft) -= 4;
457     }
458     else
459     {
460         cd->my_errno = YAZ_ICONV_E2BIG;
461         return (size_t)(-1);
462     }
463     *outbuf = (char *) outp;
464     return 0;
465 }
466
#if HAVE_WCHAR_H
/**
 * Write handler producing native wchar_t objects.
 *
 * \param cd           conversion descriptor; my_errno is set on failure
 * \param x            character value, converted to wchar_t
 * \param outbuf       in/out: output position, advanced by sizeof(wchar_t)
 *                     on success
 * \param outbytesleft in/out: free space in the output, reduced by
 *                     sizeof(wchar_t) on success
 * \return 0 on success; (size_t) -1 with my_errno set to YAZ_ICONV_E2BIG
 *         when fewer than sizeof(wchar_t) bytes are free
 */
static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
                                 char **outbuf, size_t *outbytesleft)
{
    unsigned char *dst = (unsigned char *) *outbuf;
    wchar_t wide;

    if (*outbytesleft < sizeof(wide))
    {
        cd->my_errno = YAZ_ICONV_E2BIG;
        return (size_t)(-1);
    }
    wide = x;
    /* copy bytewise: the output position need not be aligned for wchar_t */
    memcpy(dst, &wide, sizeof(wide));
    (*outbytesleft) -= sizeof(wide);
    *outbuf = (char *) (dst + sizeof(wide));
    return 0;
}
#endif
489
490 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
491 {
492     return cd->read_handle && cd->write_handle;
493 }
494
495 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
496 {
497     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
498
499     cd->write_handle = 0;
500     cd->read_handle = 0;
501     cd->init_handle = 0;
502     cd->my_errno = YAZ_ICONV_UNKNOWN;
503     cd->marc8_esc_mode = 'B';
504     cd->marc8_comb_x = 0;
505
506     /* a useful hack: if fromcode has leading @,
507        the library not use YAZ's own conversions .. */
508     if (fromcode[0] == '@')
509         fromcode++;
510     else
511     {
512         if (!yaz_matchstr(fromcode, "UTF8"))
513         {
514             cd->read_handle = yaz_read_UTF8;
515             cd->init_handle = yaz_init_UTF8;
516         }
517         else if (!yaz_matchstr(fromcode, "ISO88591"))
518             cd->read_handle = yaz_read_ISO8859_1;
519         else if (!yaz_matchstr(fromcode, "UCS4"))
520             cd->read_handle = yaz_read_UCS4;
521         else if (!yaz_matchstr(fromcode, "UCS4LE"))
522             cd->read_handle = yaz_read_UCS4LE;
523         else if (!yaz_matchstr(fromcode, "MARC8"))
524             cd->read_handle = yaz_read_marc8;
525 #if HAVE_WCHAR_H
526         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
527             cd->read_handle = yaz_read_wchar_t;
528 #endif
529         
530         if (!yaz_matchstr(tocode, "UTF8"))
531             cd->write_handle = yaz_write_UTF8;
532         else if (!yaz_matchstr(tocode, "ISO88591"))
533             cd->write_handle = yaz_write_ISO8859_1;
534         else if (!yaz_matchstr (tocode, "UCS4"))
535             cd->write_handle = yaz_write_UCS4;
536         else if (!yaz_matchstr(tocode, "UCS4LE"))
537             cd->write_handle = yaz_write_UCS4LE;
538 #if HAVE_WCHAR_H
539         else if (!yaz_matchstr(tocode, "WCHAR_T"))
540             cd->write_handle = yaz_write_wchar_t;
541 #endif
542     }
543 #if HAVE_ICONV_H
544     cd->iconv_cd = 0;
545     if (!cd->read_handle || !cd->write_handle)
546     {
547         cd->iconv_cd = iconv_open (tocode, fromcode);
548         if (cd->iconv_cd == (iconv_t) (-1))
549         {
550             xfree (cd);
551             return 0;
552         }
553     }
554 #else
555     if (!cd->read_handle || !cd->write_handle)
556     {
557         xfree (cd);
558         return 0;
559     }
560 #endif
561     cd->init_flag = 1;
562     return cd;
563 }
564
565 size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
566                   char **outbuf, size_t *outbytesleft)
567 {
568     char *inbuf0;
569     size_t r = 0;
570 #if HAVE_ICONV_H
571     if (cd->iconv_cd)
572     {
573         size_t r =
574             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
575         if (r == (size_t)(-1))
576         {
577             switch (yaz_errno())
578             {
579             case E2BIG:
580                 cd->my_errno = YAZ_ICONV_E2BIG;
581                 break;
582             case EINVAL:
583                 cd->my_errno = YAZ_ICONV_EINVAL;
584                 break;
585             case EILSEQ:
586                 cd->my_errno = YAZ_ICONV_EILSEQ;
587                 break;
588             default:
589                 cd->my_errno = YAZ_ICONV_UNKNOWN;
590             }
591         }
592         return r;
593     }
594 #endif
595     if (inbuf == 0 || *inbuf == 0)
596     {
597         cd->init_flag = 1;
598         cd->my_errno = YAZ_ICONV_UNKNOWN;
599         return 0;
600     }
601     inbuf0 = *inbuf;
602
603     if (cd->init_flag)
604     {
605         if (cd->init_handle)
606         {
607             size_t no_read;
608             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
609                                          *inbytesleft, &no_read);
610             if (r)
611             {
612                 if (cd->my_errno == YAZ_ICONV_EINVAL)
613                     return r;
614                 cd->init_flag = 0;
615                 return r;
616             }
617             *inbytesleft -= no_read;
618             *inbuf += no_read;
619         }
620         cd->init_flag = 0;
621     }
622     while (1)
623     {
624         unsigned long x;
625         size_t no_read;
626
627         if (*inbytesleft == 0)
628         {
629             r = *inbuf - inbuf0;
630             break;
631         }
632         
633         x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
634                               &no_read);
635         if (no_read == 0)
636         {
637             r = (size_t)(-1);
638             break;
639         }
640         if (x)
641         {
642             r = (cd->write_handle)(cd, x, outbuf, outbytesleft);
643             if (r)
644                 break;
645         }
646         *inbytesleft -= no_read;
647         (*inbuf) += no_read;
648     }
649     return r;
650 }
651
652 int yaz_iconv_error (yaz_iconv_t cd)
653 {
654     return cd->my_errno;
655 }
656
657 int yaz_iconv_close (yaz_iconv_t cd)
658 {
659 #if HAVE_ICONV_H
660     if (cd->iconv_cd)
661         iconv_close (cd->iconv_cd);
662 #endif
663     xfree (cd);
664     return 0;
665 }
666
667