Skip multiple of ($, in MARC-8 ESC sequence
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (c) 1997-2004, Index Data
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.4 2004-03-15 22:51:10 adam Exp $
6  */
7
8 /* mini iconv and wrapper for system iconv library (if present) */
9
10 #if HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <errno.h>
15 #include <string.h>
16 #include <ctype.h>
17 #if HAVE_WCHAR_H
18 #include <wchar.h>
19 #endif
20
21 #if HAVE_ICONV_H
22 #include <iconv.h>
23 #endif
24
25 #include <yaz/yaz-util.h>
26
27 unsigned long yaz_marc8_conv (unsigned char *inp, size_t inbytesleft,
28                               size_t *no_read);
29     
30 unsigned long yaz_marc8_cjk_conv (unsigned char *inp, size_t inbytesleft,
31                                   size_t *no_read);
32     
33 struct yaz_iconv_struct {
34     int my_errno;
35     int init_flag;
36     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
37                           size_t inbytesleft, size_t *no_read);
38     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
39                                  size_t inbytesleft, size_t *no_read);
40     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
41                            char **outbuf, size_t *outbytesleft);
42     int marc8_esc_mode;
43 #if HAVE_ICONV_H
44     iconv_t iconv_cd;
45 #endif
46 };
47
48 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
49                                          size_t inbytesleft, size_t *no_read)
50 {
51     unsigned long x = inp[0];
52     *no_read = 1;
53     return x;
54 }
55
56 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
57                              size_t inbytesleft, size_t *no_read)
58 {
59     if (inp[0] != 0xef)
60     {
61         *no_read = 0;
62         return 0;
63     }
64     if (inbytesleft < 3)
65     {
66         cd->my_errno = YAZ_ICONV_EINVAL;
67         return (size_t) -1;
68     }
69     if (inp[1] != 0xbb || inp[2] != 0xbf)
70     {
71         cd->my_errno = YAZ_ICONV_EILSEQ;
72         return (size_t) -1;
73     }
74     *no_read = 3;
75     return 0;
76 }
77
78 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
79                                     size_t inbytesleft, size_t *no_read)
80 {
81     unsigned long x = 0;
82
83     if (inp[0] <= 0x7f)
84     {
85         x = inp[0];
86         *no_read = 1;
87     }
88     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
89     {
90         *no_read = 0;
91         cd->my_errno = YAZ_ICONV_EILSEQ;
92     }
93     else if (inp[0] <= 0xdf && inbytesleft >= 2)
94     {
95         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
96         if (x >= 0x80)
97             *no_read = 2;
98         else
99         {
100             *no_read = 0;
101             cd->my_errno = YAZ_ICONV_EILSEQ;
102         }
103     }
104     else if (inp[0] <= 0xef && inbytesleft >= 3)
105     {
106         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
107             (inp[1] & 0x3f);
108         if (x >= 0x800)
109             *no_read = 3;
110         else
111         {
112             *no_read = 0;
113             cd->my_errno = YAZ_ICONV_EILSEQ;
114         }
115     }
116     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
117     {
118         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
119             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
120         if (x >= 0x10000)
121             *no_read = 4;
122         else
123         {
124             *no_read = 0;
125             cd->my_errno = YAZ_ICONV_EILSEQ;
126         }
127     }
128     else if (inp[0] <= 0xfb && inbytesleft >= 5)
129     {
130         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
131             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
132             (inp[4] & 0x3f);
133         if (x >= 0x200000)
134             *no_read = 5;
135         else
136         {
137             *no_read = 0;
138             cd->my_errno = YAZ_ICONV_EILSEQ;
139         }
140     }
141     else if (inp[0] <= 0xfd && inbytesleft >= 6)
142     {
143         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
144             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
145             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
146         if (x >= 0x4000000)
147             *no_read = 6;
148         else
149         {
150             *no_read = 0;
151             cd->my_errno = YAZ_ICONV_EILSEQ;
152         }
153     }
154     else
155     {
156         *no_read = 0;
157         cd->my_errno = YAZ_ICONV_EINVAL;
158     }
159     return x;
160 }
161
162 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
163                                     size_t inbytesleft, size_t *no_read)
164 {
165     unsigned long x = 0;
166     
167     if (inbytesleft < 4)
168     {
169         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
170         *no_read = 0;
171     }
172     else
173     {
174         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
175         *no_read = 4;
176     }
177     return x;
178 }
179
180 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
181                                       size_t inbytesleft, size_t *no_read)
182 {
183     unsigned long x = 0;
184     
185     if (inbytesleft < 4)
186     {
187         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
188         *no_read = 0;
189     }
190     else
191     {
192         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
193         *no_read = 4;
194     }
195     return x;
196 }
197
198 #if HAVE_WCHAR_H
199 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
200                                        size_t inbytesleft, size_t *no_read)
201 {
202     unsigned long x = 0;
203     
204     if (inbytesleft < sizeof(wchar_t))
205     {
206         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
207         *no_read = 0;
208     }
209     else
210     {
211         wchar_t wch;
212         memcpy (&wch, inp, sizeof(wch));
213         x = wch;
214         *no_read = sizeof(wch);
215     }
216     return x;
217 }
218 #endif
219
220 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
221                                      size_t inbytesleft, size_t *no_read)
222 {
223     *no_read = 0;
224     while(inbytesleft >= 1 && inp[0] == 27)
225     {
226         size_t inbytesleft0 = inbytesleft;
227         inp++;
228         inbytesleft--;
229         while(inbytesleft > 0 && strchr("(,$", *inp))
230         {
231             inbytesleft--;
232             inp++;
233         }
234         if (inbytesleft <= 0)
235         {
236             *no_read = 0;
237             cd->my_errno = YAZ_ICONV_EINVAL;
238             return 0;
239         }
240         if (*inp == '!')
241         {
242             if (inbytesleft <= 1)
243             {
244                 *no_read = 0;
245                 cd->my_errno = YAZ_ICONV_EINVAL;
246                 return 0;
247             }
248             inbytesleft--;
249             inp++;
250         }
251         cd->marc8_esc_mode = *inp++;
252         inbytesleft--;
253         (*no_read) += inbytesleft0 - inbytesleft;
254     }
255     if (inbytesleft <= 0)
256         return 0;
257     else
258     {
259         unsigned long x;
260         size_t no_read_sub = 0;
261
262         switch(cd->marc8_esc_mode)
263         {
264         case 'B':
265         case 'E':
266             x = yaz_marc8_conv(inp, inbytesleft, &no_read_sub);
267             *no_read += no_read_sub;
268             return x;
269         case '1':
270             x = yaz_marc8_cjk_conv(inp, inbytesleft, &no_read_sub);
271             *no_read += no_read_sub;
272             return x;
273         default:
274             *no_read = 0;
275             cd->my_errno = YAZ_ICONV_EILSEQ;
276             return 0;
277         }
278     }
279 }
280
281 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
282                               char **outbuf, size_t *outbytesleft)
283 {
284     unsigned char *outp = (unsigned char *) *outbuf;
285     if (x <= 0x7f && *outbytesleft >= 1)
286     {
287         *outp++ = (unsigned char) x;
288         (*outbytesleft)--;
289     } 
290     else if (x <= 0x7ff && *outbytesleft >= 2)
291     {
292         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
293         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
294         (*outbytesleft) -= 2;
295     }
296     else if (x <= 0xffff && *outbytesleft >= 3)
297     {
298         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
299         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
300         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
301         (*outbytesleft) -= 3;
302     }
303     else if (x <= 0x1fffff && *outbytesleft >= 4)
304     {
305         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
306         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
307         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
308         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
309         (*outbytesleft) -= 4;
310     }
311     else if (x <= 0x3ffffff && *outbytesleft >= 5)
312     {
313         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
314         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
315         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
316         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
317         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
318         (*outbytesleft) -= 5;
319     }
320     else if (*outbytesleft >= 6)
321     {
322         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
323         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
324         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
325         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
326         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
327         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
328         (*outbytesleft) -= 6;
329     }
330     else 
331     {
332         cd->my_errno = YAZ_ICONV_E2BIG;  /* not room for output */
333         return (size_t)(-1);
334     }
335     *outbuf = (char *) outp;
336     return 0;
337 }
338
339 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
340                                    char **outbuf, size_t *outbytesleft)
341 {
342     unsigned char *outp = (unsigned char *) *outbuf;
343     if (x > 255 || x < 1)
344     {
345         cd->my_errno = YAZ_ICONV_EILSEQ;
346         return (size_t) -1;
347     }
348     else if (*outbytesleft >= 1)
349     {
350         *outp++ = (unsigned char) x;
351         (*outbytesleft)--;
352     }
353     else 
354     {
355         cd->my_errno = YAZ_ICONV_E2BIG;
356         return (size_t)(-1);
357     }
358     *outbuf = (char *) outp;
359     return 0;
360 }
361
362
363 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
364                               char **outbuf, size_t *outbytesleft)
365 {
366     unsigned char *outp = (unsigned char *) *outbuf;
367     if (*outbytesleft >= 4)
368     {
369         *outp++ = (unsigned char) (x>>24);
370         *outp++ = (unsigned char) (x>>16);
371         *outp++ = (unsigned char) (x>>8);
372         *outp++ = (unsigned char) x;
373         (*outbytesleft) -= 4;
374     }
375     else
376     {
377         cd->my_errno = YAZ_ICONV_E2BIG;
378         return (size_t)(-1);
379     }
380     *outbuf = (char *) outp;
381     return 0;
382 }
383
384 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
385                                 char **outbuf, size_t *outbytesleft)
386 {
387     unsigned char *outp = (unsigned char *) *outbuf;
388     if (*outbytesleft >= 4)
389     {
390         *outp++ = (unsigned char) x;
391         *outp++ = (unsigned char) (x>>8);
392         *outp++ = (unsigned char) (x>>16);
393         *outp++ = (unsigned char) (x>>24);
394         (*outbytesleft) -= 4;
395     }
396     else
397     {
398         cd->my_errno = YAZ_ICONV_E2BIG;
399         return (size_t)(-1);
400     }
401     *outbuf = (char *) outp;
402     return 0;
403 }
404
405 #if HAVE_WCHAR_H
406 static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
407                                  char **outbuf, size_t *outbytesleft)
408 {
409     unsigned char *outp = (unsigned char *) *outbuf;
410
411     if (*outbytesleft >= sizeof(wchar_t))
412     {
413         wchar_t wch = x;
414         memcpy(outp, &wch, sizeof(wch));
415         outp += sizeof(wch);
416         (*outbytesleft) -= sizeof(wch);
417     }
418     else
419     {
420         cd->my_errno = YAZ_ICONV_E2BIG;
421         return (size_t)(-1);
422     }
423     *outbuf = (char *) outp;
424     return 0;
425 }
426 #endif
427
428 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
429 {
430     return cd->read_handle && cd->write_handle;
431 }
432
433 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
434 {
435     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
436
437     cd->write_handle = 0;
438     cd->read_handle = 0;
439     cd->init_handle = 0;
440     cd->my_errno = YAZ_ICONV_UNKNOWN;
441     cd->marc8_esc_mode = 'B';
442
443     /* a useful hack: if fromcode has leading @,
444        the library not use YAZ's own conversions .. */
445     if (fromcode[0] == '@')
446         fromcode++;
447     else
448     {
449         if (!yaz_matchstr(fromcode, "UTF8"))
450         {
451             cd->read_handle = yaz_read_UTF8;
452             cd->init_handle = yaz_init_UTF8;
453         }
454         else if (!yaz_matchstr(fromcode, "ISO88591"))
455             cd->read_handle = yaz_read_ISO8859_1;
456         else if (!yaz_matchstr(fromcode, "UCS4"))
457             cd->read_handle = yaz_read_UCS4;
458         else if (!yaz_matchstr(fromcode, "UCS4LE"))
459             cd->read_handle = yaz_read_UCS4LE;
460         else if (!yaz_matchstr(fromcode, "MARC8"))
461             cd->read_handle = yaz_read_marc8;
462 #if HAVE_WCHAR_H
463         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
464             cd->read_handle = yaz_read_wchar_t;
465 #endif
466         
467         if (!yaz_matchstr(tocode, "UTF8"))
468             cd->write_handle = yaz_write_UTF8;
469         else if (!yaz_matchstr(tocode, "ISO88591"))
470             cd->write_handle = yaz_write_ISO8859_1;
471         else if (!yaz_matchstr (tocode, "UCS4"))
472             cd->write_handle = yaz_write_UCS4;
473         else if (!yaz_matchstr(tocode, "UCS4LE"))
474             cd->write_handle = yaz_write_UCS4LE;
475 #if HAVE_WCHAR_H
476         else if (!yaz_matchstr(tocode, "WCHAR_T"))
477             cd->write_handle = yaz_write_wchar_t;
478 #endif
479     }
480 #if HAVE_ICONV_H
481     cd->iconv_cd = 0;
482     if (!cd->read_handle || !cd->write_handle)
483     {
484         cd->iconv_cd = iconv_open (tocode, fromcode);
485         if (cd->iconv_cd == (iconv_t) (-1))
486         {
487             xfree (cd);
488             return 0;
489         }
490     }
491 #else
492     if (!cd->read_handle || !cd->write_handle)
493     {
494         xfree (cd);
495         return 0;
496     }
497 #endif
498     cd->init_flag = 1;
499     return cd;
500 }
501
502 size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
503                   char **outbuf, size_t *outbytesleft)
504 {
505     char *inbuf0;
506     size_t r = 0;
507 #if HAVE_ICONV_H
508     if (cd->iconv_cd)
509     {
510         size_t r =
511             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
512         if (r == (size_t)(-1))
513         {
514             switch (yaz_errno())
515             {
516             case E2BIG:
517                 cd->my_errno = YAZ_ICONV_E2BIG;
518                 break;
519             case EINVAL:
520                 cd->my_errno = YAZ_ICONV_EINVAL;
521                 break;
522             case EILSEQ:
523                 cd->my_errno = YAZ_ICONV_EILSEQ;
524                 break;
525             default:
526                 cd->my_errno = YAZ_ICONV_UNKNOWN;
527             }
528         }
529         return r;
530     }
531 #endif
532     if (inbuf == 0 || *inbuf == 0)
533     {
534         cd->init_flag = 1;
535         cd->my_errno = YAZ_ICONV_UNKNOWN;
536         return 0;
537     }
538     inbuf0 = *inbuf;
539
540     if (cd->init_flag)
541     {
542         if (cd->init_handle)
543         {
544             size_t no_read;
545             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
546                                          *inbytesleft, &no_read);
547             if (r)
548             {
549                 if (cd->my_errno == YAZ_ICONV_EINVAL)
550                     return r;
551                 cd->init_flag = 0;
552                 return r;
553             }
554             *inbytesleft -= no_read;
555             *inbuf += no_read;
556         }
557         cd->init_flag = 0;
558     }
559     while (1)
560     {
561         unsigned long x;
562         size_t no_read;
563
564         if (*inbytesleft == 0)
565         {
566             r = *inbuf - inbuf0;
567             break;
568         }
569         
570         x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
571                               &no_read);
572         if (no_read == 0)
573         {
574             r = (size_t)(-1);
575             break;
576         }
577         if (x)
578         {
579             r = (cd->write_handle)(cd, x, outbuf, outbytesleft);
580             if (r)
581                 break;
582         }
583         *inbytesleft -= no_read;
584         (*inbuf) += no_read;
585     }
586     return r;
587 }
588
589 int yaz_iconv_error (yaz_iconv_t cd)
590 {
591     return cd->my_errno;
592 }
593
594 int yaz_iconv_close (yaz_iconv_t cd)
595 {
596 #if HAVE_ICONV_H
597     if (cd->iconv_cd)
598         iconv_close (cd->iconv_cd);
599 #endif
600     xfree (cd);
601     return 0;
602 }
603
604