7d31a00caa2fd64cb8f9e5edfe4c17c1625df352
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (c) 1997-2004, Index Data
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.6 2004-08-07 08:18:19 adam Exp $
6  */
7
8 /* mini iconv and wrapper for system iconv library (if present) */
9
10 #if HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <errno.h>
15 #include <string.h>
16 #include <ctype.h>
17 #if HAVE_WCHAR_H
18 #include <wchar.h>
19 #endif
20
21 #if HAVE_ICONV_H
22 #include <iconv.h>
23 #endif
24
25 #include <yaz/yaz-util.h>
26
/*
 * MARC-8 table converters, defined outside this file.  yaz_read_marc8()
 * picks one of them according to the current escape mode.  Each takes the
 * input position and remaining byte count and reports, through no_read and
 * combining, how many bytes it used and whether the result is a combining
 * character.
 */
unsigned long yaz_marc8_1_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_2_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_3_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_4_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_5_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_6_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_7_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_8_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
unsigned long yaz_marc8_9_conv (unsigned char *inp, size_t inbytesleft,
                                size_t *no_read, int *combining);
45     
46 struct yaz_iconv_struct {
47     int my_errno;
48     int init_flag;
49     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
50                           size_t inbytesleft, size_t *no_read);
51     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
52                                  size_t inbytesleft, size_t *no_read);
53     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
54                            char **outbuf, size_t *outbytesleft);
55     int marc8_esc_mode;
56     int marc8_comb_x;
57     int marc8_comb_no_read;
58 #if HAVE_ICONV_H
59     iconv_t iconv_cd;
60 #endif
61 };
62
63 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
64                                          size_t inbytesleft, size_t *no_read)
65 {
66     unsigned long x = inp[0];
67     *no_read = 1;
68     return x;
69 }
70
71 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
72                              size_t inbytesleft, size_t *no_read)
73 {
74     if (inp[0] != 0xef)
75     {
76         *no_read = 0;
77         return 0;
78     }
79     if (inbytesleft < 3)
80     {
81         cd->my_errno = YAZ_ICONV_EINVAL;
82         return (size_t) -1;
83     }
84     if (inp[1] != 0xbb || inp[2] != 0xbf)
85     {
86         cd->my_errno = YAZ_ICONV_EILSEQ;
87         return (size_t) -1;
88     }
89     *no_read = 3;
90     return 0;
91 }
92
93 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
94                                     size_t inbytesleft, size_t *no_read)
95 {
96     unsigned long x = 0;
97
98     if (inp[0] <= 0x7f)
99     {
100         x = inp[0];
101         *no_read = 1;
102     }
103     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
104     {
105         *no_read = 0;
106         cd->my_errno = YAZ_ICONV_EILSEQ;
107     }
108     else if (inp[0] <= 0xdf && inbytesleft >= 2)
109     {
110         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
111         if (x >= 0x80)
112             *no_read = 2;
113         else
114         {
115             *no_read = 0;
116             cd->my_errno = YAZ_ICONV_EILSEQ;
117         }
118     }
119     else if (inp[0] <= 0xef && inbytesleft >= 3)
120     {
121         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
122             (inp[1] & 0x3f);
123         if (x >= 0x800)
124             *no_read = 3;
125         else
126         {
127             *no_read = 0;
128             cd->my_errno = YAZ_ICONV_EILSEQ;
129         }
130     }
131     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
132     {
133         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
134             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
135         if (x >= 0x10000)
136             *no_read = 4;
137         else
138         {
139             *no_read = 0;
140             cd->my_errno = YAZ_ICONV_EILSEQ;
141         }
142     }
143     else if (inp[0] <= 0xfb && inbytesleft >= 5)
144     {
145         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
146             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
147             (inp[4] & 0x3f);
148         if (x >= 0x200000)
149             *no_read = 5;
150         else
151         {
152             *no_read = 0;
153             cd->my_errno = YAZ_ICONV_EILSEQ;
154         }
155     }
156     else if (inp[0] <= 0xfd && inbytesleft >= 6)
157     {
158         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
159             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
160             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
161         if (x >= 0x4000000)
162             *no_read = 6;
163         else
164         {
165             *no_read = 0;
166             cd->my_errno = YAZ_ICONV_EILSEQ;
167         }
168     }
169     else
170     {
171         *no_read = 0;
172         cd->my_errno = YAZ_ICONV_EINVAL;
173     }
174     return x;
175 }
176
177 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
178                                     size_t inbytesleft, size_t *no_read)
179 {
180     unsigned long x = 0;
181     
182     if (inbytesleft < 4)
183     {
184         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
185         *no_read = 0;
186     }
187     else
188     {
189         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
190         *no_read = 4;
191     }
192     return x;
193 }
194
195 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
196                                       size_t inbytesleft, size_t *no_read)
197 {
198     unsigned long x = 0;
199     
200     if (inbytesleft < 4)
201     {
202         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
203         *no_read = 0;
204     }
205     else
206     {
207         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
208         *no_read = 4;
209     }
210     return x;
211 }
212
213 #if HAVE_WCHAR_H
214 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
215                                        size_t inbytesleft, size_t *no_read)
216 {
217     unsigned long x = 0;
218     
219     if (inbytesleft < sizeof(wchar_t))
220     {
221         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
222         *no_read = 0;
223     }
224     else
225     {
226         wchar_t wch;
227         memcpy (&wch, inp, sizeof(wch));
228         x = wch;
229         *no_read = sizeof(wch);
230     }
231     return x;
232 }
233 #endif
234
235 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
236                                      size_t inbytesleft, size_t *no_read)
237 {
238     if (cd->marc8_comb_x)
239     {
240         unsigned long x = cd->marc8_comb_x;
241         *no_read = cd->marc8_comb_no_read;
242         cd->marc8_comb_x = 0;
243         return x;
244     }
245     *no_read = 0;
246     while(inbytesleft >= 1 && inp[0] == 27)
247     {
248         size_t inbytesleft0 = inbytesleft;
249         inp++;
250         inbytesleft--;
251         while(inbytesleft > 0 && strchr("(,$!", *inp))
252         {
253             inbytesleft--;
254             inp++;
255         }
256         if (inbytesleft <= 0)
257         {
258             *no_read = 0;
259             cd->my_errno = YAZ_ICONV_EINVAL;
260             return 0;
261         }
262         cd->marc8_esc_mode = *inp++;
263         inbytesleft--;
264         (*no_read) += inbytesleft0 - inbytesleft;
265     }
266     if (inbytesleft <= 0)
267         return 0;
268     else
269     {
270         unsigned long x;
271         int comb = 0;
272         size_t no_read_sub = 0;
273
274         switch(cd->marc8_esc_mode)
275         {
276         case 'B':  /* Basic ASCII */
277         case 'E':  /* ANSEL */
278         case 's':  /* ASCII */
279             x = yaz_marc8_1_conv(inp, inbytesleft, &no_read_sub, &comb);
280             break;
281         case 'g':  /* Greek */
282             x = yaz_marc8_2_conv(inp, inbytesleft, &no_read_sub, &comb);
283             break;
284         case 'b':  /* Subscripts */
285             x = yaz_marc8_3_conv(inp, inbytesleft, &no_read_sub, &comb);
286             break;
287         case 'p':  /* Superscripts */
288             x = yaz_marc8_4_conv(inp, inbytesleft, &no_read_sub, &comb);
289             break;
290         case '2':  /* Basic Hebrew */
291             x = yaz_marc8_5_conv(inp, inbytesleft, &no_read_sub, &comb);
292             break;
293         case 'N':  /* Basic Cyrillic */
294         case 'Q':  /* Extended Cyrillic */
295             x = yaz_marc8_6_conv(inp, inbytesleft, &no_read_sub, &comb);
296             break;
297         case '3':  /* Basic Arabic */
298         case '4':  /* Extended Arabic */
299             x = yaz_marc8_7_conv(inp, inbytesleft, &no_read_sub, &comb);
300             break;
301         case 'S':  /* Greek */
302             x = yaz_marc8_8_conv(inp, inbytesleft, &no_read_sub, &comb);
303             break;
304         case '1':  /* Chinese, Japanese, Korean (EACC) */
305             x = yaz_marc8_9_conv(inp, inbytesleft, &no_read_sub, &comb);
306             break;
307         default:
308             *no_read = 0;
309             cd->my_errno = YAZ_ICONV_EILSEQ;
310             return 0;
311         }
312 #if 0
313         printf ("esc mode=%c x=%04lX comb=%d\n", cd->marc8_esc_mode, x, comb);
314 #endif
315         *no_read += no_read_sub;
316
317         if (comb && cd->marc8_comb_x == 0)
318         {
319             size_t tmp_read = 0;
320             unsigned long next_x;
321
322             /* read next char .. */
323             next_x = yaz_read_marc8(cd, inp + *no_read,
324                                     inbytesleft - *no_read, &tmp_read);
325             /* save this x for later .. */
326             cd->marc8_comb_x = x;
327             /* save next read for later .. */
328             cd->marc8_comb_no_read = tmp_read;
329             /* return next x - thereby swap */
330             x = next_x;
331         }
332         return x;
333     }
334 }
335
336 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
337                               char **outbuf, size_t *outbytesleft)
338 {
339     unsigned char *outp = (unsigned char *) *outbuf;
340     if (x <= 0x7f && *outbytesleft >= 1)
341     {
342         *outp++ = (unsigned char) x;
343         (*outbytesleft)--;
344     } 
345     else if (x <= 0x7ff && *outbytesleft >= 2)
346     {
347         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
348         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
349         (*outbytesleft) -= 2;
350     }
351     else if (x <= 0xffff && *outbytesleft >= 3)
352     {
353         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
354         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
355         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
356         (*outbytesleft) -= 3;
357     }
358     else if (x <= 0x1fffff && *outbytesleft >= 4)
359     {
360         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
361         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
362         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
363         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
364         (*outbytesleft) -= 4;
365     }
366     else if (x <= 0x3ffffff && *outbytesleft >= 5)
367     {
368         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
369         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
370         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
371         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
372         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
373         (*outbytesleft) -= 5;
374     }
375     else if (*outbytesleft >= 6)
376     {
377         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
378         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
379         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
380         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
381         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
382         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
383         (*outbytesleft) -= 6;
384     }
385     else 
386     {
387         cd->my_errno = YAZ_ICONV_E2BIG;  /* not room for output */
388         return (size_t)(-1);
389     }
390     *outbuf = (char *) outp;
391     return 0;
392 }
393
394 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
395                                    char **outbuf, size_t *outbytesleft)
396 {
397     unsigned char *outp = (unsigned char *) *outbuf;
398     if (x > 255 || x < 1)
399     {
400         cd->my_errno = YAZ_ICONV_EILSEQ;
401         return (size_t) -1;
402     }
403     else if (*outbytesleft >= 1)
404     {
405         *outp++ = (unsigned char) x;
406         (*outbytesleft)--;
407     }
408     else 
409     {
410         cd->my_errno = YAZ_ICONV_E2BIG;
411         return (size_t)(-1);
412     }
413     *outbuf = (char *) outp;
414     return 0;
415 }
416
417
418 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
419                               char **outbuf, size_t *outbytesleft)
420 {
421     unsigned char *outp = (unsigned char *) *outbuf;
422     if (*outbytesleft >= 4)
423     {
424         *outp++ = (unsigned char) (x>>24);
425         *outp++ = (unsigned char) (x>>16);
426         *outp++ = (unsigned char) (x>>8);
427         *outp++ = (unsigned char) x;
428         (*outbytesleft) -= 4;
429     }
430     else
431     {
432         cd->my_errno = YAZ_ICONV_E2BIG;
433         return (size_t)(-1);
434     }
435     *outbuf = (char *) outp;
436     return 0;
437 }
438
439 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
440                                 char **outbuf, size_t *outbytesleft)
441 {
442     unsigned char *outp = (unsigned char *) *outbuf;
443     if (*outbytesleft >= 4)
444     {
445         *outp++ = (unsigned char) x;
446         *outp++ = (unsigned char) (x>>8);
447         *outp++ = (unsigned char) (x>>16);
448         *outp++ = (unsigned char) (x>>24);
449         (*outbytesleft) -= 4;
450     }
451     else
452     {
453         cd->my_errno = YAZ_ICONV_E2BIG;
454         return (size_t)(-1);
455     }
456     *outbuf = (char *) outp;
457     return 0;
458 }
459
460 #if HAVE_WCHAR_H
461 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
462                                  char **outbuf, size_t *outbytesleft)
463 {
464     unsigned char *outp = (unsigned char *) *outbuf;
465
466     if (*outbytesleft >= sizeof(wchar_t))
467     {
468         wchar_t wch = x;
469         memcpy(outp, &wch, sizeof(wch));
470         outp += sizeof(wch);
471         (*outbytesleft) -= sizeof(wch);
472     }
473     else
474     {
475         cd->my_errno = YAZ_ICONV_E2BIG;
476         return (size_t)(-1);
477     }
478     *outbuf = (char *) outp;
479     return 0;
480 }
481 #endif
482
483 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
484 {
485     return cd->read_handle && cd->write_handle;
486 }
487
488 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
489 {
490     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
491
492     cd->write_handle = 0;
493     cd->read_handle = 0;
494     cd->init_handle = 0;
495     cd->my_errno = YAZ_ICONV_UNKNOWN;
496     cd->marc8_esc_mode = 'B';
497     cd->marc8_comb_x = 0;
498
499     /* a useful hack: if fromcode has leading @,
500        the library not use YAZ's own conversions .. */
501     if (fromcode[0] == '@')
502         fromcode++;
503     else
504     {
505         if (!yaz_matchstr(fromcode, "UTF8"))
506         {
507             cd->read_handle = yaz_read_UTF8;
508             cd->init_handle = yaz_init_UTF8;
509         }
510         else if (!yaz_matchstr(fromcode, "ISO88591"))
511             cd->read_handle = yaz_read_ISO8859_1;
512         else if (!yaz_matchstr(fromcode, "UCS4"))
513             cd->read_handle = yaz_read_UCS4;
514         else if (!yaz_matchstr(fromcode, "UCS4LE"))
515             cd->read_handle = yaz_read_UCS4LE;
516         else if (!yaz_matchstr(fromcode, "MARC8"))
517             cd->read_handle = yaz_read_marc8;
518 #if HAVE_WCHAR_H
519         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
520             cd->read_handle = yaz_read_wchar_t;
521 #endif
522         
523         if (!yaz_matchstr(tocode, "UTF8"))
524             cd->write_handle = yaz_write_UTF8;
525         else if (!yaz_matchstr(tocode, "ISO88591"))
526             cd->write_handle = yaz_write_ISO8859_1;
527         else if (!yaz_matchstr (tocode, "UCS4"))
528             cd->write_handle = yaz_write_UCS4;
529         else if (!yaz_matchstr(tocode, "UCS4LE"))
530             cd->write_handle = yaz_write_UCS4LE;
531 #if HAVE_WCHAR_H
532         else if (!yaz_matchstr(tocode, "WCHAR_T"))
533             cd->write_handle = yaz_write_wchar_t;
534 #endif
535     }
536 #if HAVE_ICONV_H
537     cd->iconv_cd = 0;
538     if (!cd->read_handle || !cd->write_handle)
539     {
540         cd->iconv_cd = iconv_open (tocode, fromcode);
541         if (cd->iconv_cd == (iconv_t) (-1))
542         {
543             xfree (cd);
544             return 0;
545         }
546     }
547 #else
548     if (!cd->read_handle || !cd->write_handle)
549     {
550         xfree (cd);
551         return 0;
552     }
553 #endif
554     cd->init_flag = 1;
555     return cd;
556 }
557
558 size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
559                   char **outbuf, size_t *outbytesleft)
560 {
561     char *inbuf0;
562     size_t r = 0;
563 #if HAVE_ICONV_H
564     if (cd->iconv_cd)
565     {
566         size_t r =
567             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
568         if (r == (size_t)(-1))
569         {
570             switch (yaz_errno())
571             {
572             case E2BIG:
573                 cd->my_errno = YAZ_ICONV_E2BIG;
574                 break;
575             case EINVAL:
576                 cd->my_errno = YAZ_ICONV_EINVAL;
577                 break;
578             case EILSEQ:
579                 cd->my_errno = YAZ_ICONV_EILSEQ;
580                 break;
581             default:
582                 cd->my_errno = YAZ_ICONV_UNKNOWN;
583             }
584         }
585         return r;
586     }
587 #endif
588     if (inbuf == 0 || *inbuf == 0)
589     {
590         cd->init_flag = 1;
591         cd->my_errno = YAZ_ICONV_UNKNOWN;
592         return 0;
593     }
594     inbuf0 = *inbuf;
595
596     if (cd->init_flag)
597     {
598         if (cd->init_handle)
599         {
600             size_t no_read;
601             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
602                                          *inbytesleft, &no_read);
603             if (r)
604             {
605                 if (cd->my_errno == YAZ_ICONV_EINVAL)
606                     return r;
607                 cd->init_flag = 0;
608                 return r;
609             }
610             *inbytesleft -= no_read;
611             *inbuf += no_read;
612         }
613         cd->init_flag = 0;
614     }
615     while (1)
616     {
617         unsigned long x;
618         size_t no_read;
619
620         if (*inbytesleft == 0)
621         {
622             r = *inbuf - inbuf0;
623             break;
624         }
625         
626         x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
627                               &no_read);
628         if (no_read == 0)
629         {
630             r = (size_t)(-1);
631             break;
632         }
633         if (x)
634         {
635             r = (cd->write_handle)(cd, x, outbuf, outbytesleft);
636             if (r)
637                 break;
638         }
639         *inbytesleft -= no_read;
640         (*inbuf) += no_read;
641     }
642     return r;
643 }
644
645 int yaz_iconv_error (yaz_iconv_t cd)
646 {
647     return cd->my_errno;
648 }
649
650 int yaz_iconv_close (yaz_iconv_t cd)
651 {
652 #if HAVE_ICONV_H
653     if (cd->iconv_cd)
654         iconv_close (cd->iconv_cd);
655 #endif
656     xfree (cd);
657     return 0;
658 }
659
660