Extend MARC-8 to handle ESC-G0 EACC. Fix conversion order for MARCXML
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (c) 1997-2004, Index Data
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.3 2004-03-15 21:39:06 adam Exp $
6  */
7
8 /* mini iconv and wrapper for system iconv library (if present) */
9
10 #if HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <errno.h>
15 #include <string.h>
16 #include <ctype.h>
17 #if HAVE_WCHAR_H
18 #include <wchar.h>
19 #endif
20
21 #if HAVE_ICONV_H
22 #include <iconv.h>
23 #endif
24
25 #include <yaz/yaz-util.h>
26
27 unsigned long yaz_marc8_conv (unsigned char *inp, size_t inbytesleft,
28                               size_t *no_read); /* defined outside this section; yaz_read_marc8 calls it while marc8_esc_mode is 'B' or 'E' and adds *no_read to its own count */
29     
30 unsigned long yaz_marc8_cjk_conv (unsigned char *inp, size_t inbytesleft,
31                                   size_t *no_read); /* defined outside this section; yaz_read_marc8 calls it while marc8_esc_mode is '1' and adds *no_read to its own count */
32     
33 struct yaz_iconv_struct { /* state behind a yaz_iconv_t conversion descriptor */
34     int my_errno; /* last YAZ_ICONV_* code recorded; reported by yaz_iconv_error */
35     int init_flag; /* non-zero: the next yaz_iconv call runs init_handle (if any) first */
36     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
37                           size_t inbytesleft, size_t *no_read); /* start-of-input hook; only set for a UTF-8 source, otherwise 0 */
38     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
39                                  size_t inbytesleft, size_t *no_read); /* decodes one character; *no_read == 0 signals failure; 0 when no built-in decoder */
40     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
41                            char **outbuf, size_t *outbytesleft); /* encodes one character; non-zero return signals failure; 0 when no built-in encoder */
42     int marc8_esc_mode; /* final byte of the last MARC-8 escape sequence seen; starts as 'B' */
43 #if HAVE_ICONV_H
44     iconv_t iconv_cd; /* system iconv descriptor; 0 when the built-in handlers are used */
45 #endif
46 };
47
48 static unsigned long yaz_read_ISO8859_1 (yaz_iconv_t cd, unsigned char *inp,
49                                          size_t inbytesleft, size_t *no_read)
50 {
51     unsigned long x = inp[0];
52     *no_read = 1;
53     return x;
54 }
55
56 static size_t yaz_init_UTF8 (yaz_iconv_t cd, unsigned char *inp,
57                              size_t inbytesleft, size_t *no_read)
58 {
59     if (inp[0] != 0xef)
60     {
61         *no_read = 0;
62         return 0;
63     }
64     if (inbytesleft < 3)
65     {
66         cd->my_errno = YAZ_ICONV_EINVAL;
67         return (size_t) -1;
68     }
69     if (inp[1] != 0xbb || inp[2] != 0xbf)
70     {
71         cd->my_errno = YAZ_ICONV_EILSEQ;
72         return (size_t) -1;
73     }
74     *no_read = 3;
75     return 0;
76 }
77
78 static unsigned long yaz_read_UTF8 (yaz_iconv_t cd, unsigned char *inp,
79                                     size_t inbytesleft, size_t *no_read)
80 {
81     unsigned long x = 0;
82
83     if (inp[0] <= 0x7f)
84     {
85         x = inp[0];
86         *no_read = 1;
87     }
88     else if (inp[0] <= 0xbf || inp[0] >= 0xfe)
89     {
90         *no_read = 0;
91         cd->my_errno = YAZ_ICONV_EILSEQ;
92     }
93     else if (inp[0] <= 0xdf && inbytesleft >= 2)
94     {
95         x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f);
96         if (x >= 0x80)
97             *no_read = 2;
98         else
99         {
100             *no_read = 0;
101             cd->my_errno = YAZ_ICONV_EILSEQ;
102         }
103     }
104     else if (inp[0] <= 0xef && inbytesleft >= 3)
105     {
106         x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) |
107             (inp[1] & 0x3f);
108         if (x >= 0x800)
109             *no_read = 3;
110         else
111         {
112             *no_read = 0;
113             cd->my_errno = YAZ_ICONV_EILSEQ;
114         }
115     }
116     else if (inp[0] <= 0xf7 && inbytesleft >= 4)
117     {
118         x =  ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) |
119             ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f);
120         if (x >= 0x10000)
121             *no_read = 4;
122         else
123         {
124             *no_read = 0;
125             cd->my_errno = YAZ_ICONV_EILSEQ;
126         }
127     }
128     else if (inp[0] <= 0xfb && inbytesleft >= 5)
129     {
130         x =  ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) |
131             ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) |
132             (inp[4] & 0x3f);
133         if (x >= 0x200000)
134             *no_read = 5;
135         else
136         {
137             *no_read = 0;
138             cd->my_errno = YAZ_ICONV_EILSEQ;
139         }
140     }
141     else if (inp[0] <= 0xfd && inbytesleft >= 6)
142     {
143         x =  ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) |
144             ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) |
145             ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f);
146         if (x >= 0x4000000)
147             *no_read = 6;
148         else
149         {
150             *no_read = 0;
151             cd->my_errno = YAZ_ICONV_EILSEQ;
152         }
153     }
154     else
155     {
156         *no_read = 0;
157         cd->my_errno = YAZ_ICONV_EINVAL;
158     }
159     return x;
160 }
161
162 static unsigned long yaz_read_UCS4 (yaz_iconv_t cd, unsigned char *inp,
163                                     size_t inbytesleft, size_t *no_read)
164 {
165     unsigned long x = 0;
166     
167     if (inbytesleft < 4)
168     {
169         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
170         *no_read = 0;
171     }
172     else
173     {
174         x = (inp[0]<<24) | (inp[1]<<16) | (inp[2]<<8) | inp[3];
175         *no_read = 4;
176     }
177     return x;
178 }
179
180 static unsigned long yaz_read_UCS4LE (yaz_iconv_t cd, unsigned char *inp,
181                                       size_t inbytesleft, size_t *no_read)
182 {
183     unsigned long x = 0;
184     
185     if (inbytesleft < 4)
186     {
187         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
188         *no_read = 0;
189     }
190     else
191     {
192         x = (inp[3]<<24) | (inp[2]<<16) | (inp[1]<<8) | inp[0];
193         *no_read = 4;
194     }
195     return x;
196 }
197
#if HAVE_WCHAR_H
/* Decode one native wchar_t from the input.
 *
 * Consumes sizeof(wchar_t) bytes and returns the value converted to
 * unsigned long.  When fewer bytes are available, returns 0 with
 * *no_read = 0 and cd->my_errno = YAZ_ICONV_EINVAL. */
static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
                                       size_t inbytesleft, size_t *no_read)
{
    wchar_t wc;

    if (inbytesleft < sizeof(wc))
    {
        *no_read = 0;
        cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
        return 0;
    }
    /* copy out byte-wise: inp carries no alignment guarantee */
    memcpy (&wc, inp, sizeof(wc));
    *no_read = sizeof(wc);
    return (unsigned long) wc;
}
#endif
219
220 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
221                                      size_t inbytesleft, size_t *no_read)
222 {
223     *no_read = 0;
224     while(inbytesleft >= 1 && inp[0] == 27)
225     {
226         size_t inbytesleft0 = inbytesleft;
227         inp++;
228         inbytesleft--;
229         if (inbytesleft <= 1)
230         {
231             *no_read = 0;
232             cd->my_errno = YAZ_ICONV_EINVAL;
233             return 0;
234         }
235         if (*inp == '(' || *inp == ',') /* GO, one bytes */
236         {
237             inbytesleft--;
238             inp++;
239         }
240         else if (*inp == '$') /* G0, multi byte */
241         {
242             inbytesleft--;
243             inp++;
244             if (inp[0] == ',')
245             {
246                 inbytesleft--;
247                 inp++;
248             }
249         }
250         if (inbytesleft <= 0)
251         {
252             *no_read = 0;
253             cd->my_errno = YAZ_ICONV_EINVAL;
254             return 0;
255         }
256         if (*inp == '!')
257         {
258             if (inbytesleft <= 1)
259             {
260                 *no_read = 0;
261                 cd->my_errno = YAZ_ICONV_EINVAL;
262                 return 0;
263             }
264             inbytesleft--;
265             inp++;
266         }
267         cd->marc8_esc_mode = *inp++;
268         inbytesleft--;
269         (*no_read) += inbytesleft0 - inbytesleft;
270     }
271     if (inbytesleft <= 0)
272         return 0;
273     else
274     {
275         unsigned long x;
276         size_t no_read_sub = 0;
277
278         switch(cd->marc8_esc_mode)
279         {
280         case 'B':
281         case 'E':
282             x = yaz_marc8_conv(inp, inbytesleft, &no_read_sub);
283             *no_read += no_read_sub;
284             return x;
285         case '1':
286             x = yaz_marc8_cjk_conv(inp, inbytesleft, &no_read_sub);
287             *no_read += no_read_sub;
288             return x;
289         default:
290             *no_read = 0;
291             cd->my_errno = YAZ_ICONV_EILSEQ;
292             return 0;
293         }
294     }
295 }
296
297 static size_t yaz_write_UTF8 (yaz_iconv_t cd, unsigned long x,
298                               char **outbuf, size_t *outbytesleft)
299 {
300     unsigned char *outp = (unsigned char *) *outbuf;
301     if (x <= 0x7f && *outbytesleft >= 1)
302     {
303         *outp++ = (unsigned char) x;
304         (*outbytesleft)--;
305     } 
306     else if (x <= 0x7ff && *outbytesleft >= 2)
307     {
308         *outp++ = (unsigned char) ((x >> 6) | 0xc0);
309         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
310         (*outbytesleft) -= 2;
311     }
312     else if (x <= 0xffff && *outbytesleft >= 3)
313     {
314         *outp++ = (unsigned char) ((x >> 12) | 0xe0);
315         *outp++ = (unsigned char) (((x >> 6) & 0x3f) | 0x80);
316         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
317         (*outbytesleft) -= 3;
318     }
319     else if (x <= 0x1fffff && *outbytesleft >= 4)
320     {
321         *outp++ = (unsigned char) ((x >> 18) | 0xf0);
322         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
323         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
324         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
325         (*outbytesleft) -= 4;
326     }
327     else if (x <= 0x3ffffff && *outbytesleft >= 5)
328     {
329         *outp++ = (unsigned char) ((x >> 24) | 0xf8);
330         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
331         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
332         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
333         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
334         (*outbytesleft) -= 5;
335     }
336     else if (*outbytesleft >= 6)
337     {
338         *outp++ = (unsigned char) ((x >> 30) | 0xfc);
339         *outp++ = (unsigned char) (((x >> 24) & 0x3f) | 0x80);
340         *outp++ = (unsigned char) (((x >> 18) & 0x3f) | 0x80);
341         *outp++ = (unsigned char) (((x >> 12) & 0x3f) | 0x80);
342         *outp++ = (unsigned char) (((x >> 6)  & 0x3f) | 0x80);
343         *outp++ = (unsigned char) ((x & 0x3f) | 0x80);
344         (*outbytesleft) -= 6;
345     }
346     else 
347     {
348         cd->my_errno = YAZ_ICONV_E2BIG;  /* not room for output */
349         return (size_t)(-1);
350     }
351     *outbuf = (char *) outp;
352     return 0;
353 }
354
355 static size_t yaz_write_ISO8859_1 (yaz_iconv_t cd, unsigned long x,
356                                    char **outbuf, size_t *outbytesleft)
357 {
358     unsigned char *outp = (unsigned char *) *outbuf;
359     if (x > 255 || x < 1)
360     {
361         cd->my_errno = YAZ_ICONV_EILSEQ;
362         return (size_t) -1;
363     }
364     else if (*outbytesleft >= 1)
365     {
366         *outp++ = (unsigned char) x;
367         (*outbytesleft)--;
368     }
369     else 
370     {
371         cd->my_errno = YAZ_ICONV_E2BIG;
372         return (size_t)(-1);
373     }
374     *outbuf = (char *) outp;
375     return 0;
376 }
377
378
379 static size_t yaz_write_UCS4 (yaz_iconv_t cd, unsigned long x,
380                               char **outbuf, size_t *outbytesleft)
381 {
382     unsigned char *outp = (unsigned char *) *outbuf;
383     if (*outbytesleft >= 4)
384     {
385         *outp++ = (unsigned char) (x>>24);
386         *outp++ = (unsigned char) (x>>16);
387         *outp++ = (unsigned char) (x>>8);
388         *outp++ = (unsigned char) x;
389         (*outbytesleft) -= 4;
390     }
391     else
392     {
393         cd->my_errno = YAZ_ICONV_E2BIG;
394         return (size_t)(-1);
395     }
396     *outbuf = (char *) outp;
397     return 0;
398 }
399
400 static size_t yaz_write_UCS4LE (yaz_iconv_t cd, unsigned long x,
401                                 char **outbuf, size_t *outbytesleft)
402 {
403     unsigned char *outp = (unsigned char *) *outbuf;
404     if (*outbytesleft >= 4)
405     {
406         *outp++ = (unsigned char) x;
407         *outp++ = (unsigned char) (x>>8);
408         *outp++ = (unsigned char) (x>>16);
409         *outp++ = (unsigned char) (x>>24);
410         (*outbytesleft) -= 4;
411     }
412     else
413     {
414         cd->my_errno = YAZ_ICONV_E2BIG;
415         return (size_t)(-1);
416     }
417     *outbuf = (char *) outp;
418     return 0;
419 }
420
#if HAVE_WCHAR_H
/* Encode character x as one native wchar_t at *outbuf.
 *
 * Returns 0 on success after advancing *outbuf and reducing
 * *outbytesleft by sizeof(wchar_t).  Returns (size_t) -1 with
 * cd->my_errno = YAZ_ICONV_E2BIG when the output buffer is too small. */
static size_t yaz_write_wchar_t (yaz_iconv_t cd, unsigned long x,
                                 char **outbuf, size_t *outbytesleft)
{
    unsigned char *dst = (unsigned char *) *outbuf;
    wchar_t wc;

    if (*outbytesleft < sizeof(wchar_t))
    {
        cd->my_errno = YAZ_ICONV_E2BIG;
        return (size_t)(-1);
    }
    wc = x;
    /* store byte-wise: the output buffer carries no alignment guarantee */
    memcpy(dst, &wc, sizeof(wc));
    (*outbytesleft) -= sizeof(wc);
    *outbuf = (char *) (dst + sizeof(wc));
    return 0;
}
#endif
443
444 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
445 {
446     return cd->read_handle && cd->write_handle;
447 }
448
449 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
450 {
451     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
452
453     cd->write_handle = 0;
454     cd->read_handle = 0;
455     cd->init_handle = 0;
456     cd->my_errno = YAZ_ICONV_UNKNOWN;
457     cd->marc8_esc_mode = 'B';
458
459     /* a useful hack: if fromcode has leading @,
460        the library not use YAZ's own conversions .. */
461     if (fromcode[0] == '@')
462         fromcode++;
463     else
464     {
465         if (!yaz_matchstr(fromcode, "UTF8"))
466         {
467             cd->read_handle = yaz_read_UTF8;
468             cd->init_handle = yaz_init_UTF8;
469         }
470         else if (!yaz_matchstr(fromcode, "ISO88591"))
471             cd->read_handle = yaz_read_ISO8859_1;
472         else if (!yaz_matchstr(fromcode, "UCS4"))
473             cd->read_handle = yaz_read_UCS4;
474         else if (!yaz_matchstr(fromcode, "UCS4LE"))
475             cd->read_handle = yaz_read_UCS4LE;
476         else if (!yaz_matchstr(fromcode, "MARC8"))
477             cd->read_handle = yaz_read_marc8;
478 #if HAVE_WCHAR_H
479         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
480             cd->read_handle = yaz_read_wchar_t;
481 #endif
482         
483         if (!yaz_matchstr(tocode, "UTF8"))
484             cd->write_handle = yaz_write_UTF8;
485         else if (!yaz_matchstr(tocode, "ISO88591"))
486             cd->write_handle = yaz_write_ISO8859_1;
487         else if (!yaz_matchstr (tocode, "UCS4"))
488             cd->write_handle = yaz_write_UCS4;
489         else if (!yaz_matchstr(tocode, "UCS4LE"))
490             cd->write_handle = yaz_write_UCS4LE;
491 #if HAVE_WCHAR_H
492         else if (!yaz_matchstr(tocode, "WCHAR_T"))
493             cd->write_handle = yaz_write_wchar_t;
494 #endif
495     }
496 #if HAVE_ICONV_H
497     cd->iconv_cd = 0;
498     if (!cd->read_handle || !cd->write_handle)
499     {
500         cd->iconv_cd = iconv_open (tocode, fromcode);
501         if (cd->iconv_cd == (iconv_t) (-1))
502         {
503             xfree (cd);
504             return 0;
505         }
506     }
507 #else
508     if (!cd->read_handle || !cd->write_handle)
509     {
510         xfree (cd);
511         return 0;
512     }
513 #endif
514     cd->init_flag = 1;
515     return cd;
516 }
517
518 size_t yaz_iconv (yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
519                   char **outbuf, size_t *outbytesleft)
520 {
521     char *inbuf0;
522     size_t r = 0;
523 #if HAVE_ICONV_H
524     if (cd->iconv_cd)
525     {
526         size_t r =
527             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
528         if (r == (size_t)(-1))
529         {
530             switch (yaz_errno())
531             {
532             case E2BIG:
533                 cd->my_errno = YAZ_ICONV_E2BIG;
534                 break;
535             case EINVAL:
536                 cd->my_errno = YAZ_ICONV_EINVAL;
537                 break;
538             case EILSEQ:
539                 cd->my_errno = YAZ_ICONV_EILSEQ;
540                 break;
541             default:
542                 cd->my_errno = YAZ_ICONV_UNKNOWN;
543             }
544         }
545         return r;
546     }
547 #endif
548     if (inbuf == 0 || *inbuf == 0)
549     {
550         cd->init_flag = 1;
551         cd->my_errno = YAZ_ICONV_UNKNOWN;
552         return 0;
553     }
554     inbuf0 = *inbuf;
555
556     if (cd->init_flag)
557     {
558         if (cd->init_handle)
559         {
560             size_t no_read;
561             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
562                                          *inbytesleft, &no_read);
563             if (r)
564             {
565                 if (cd->my_errno == YAZ_ICONV_EINVAL)
566                     return r;
567                 cd->init_flag = 0;
568                 return r;
569             }
570             *inbytesleft -= no_read;
571             *inbuf += no_read;
572         }
573         cd->init_flag = 0;
574     }
575     while (1)
576     {
577         unsigned long x;
578         size_t no_read;
579
580         if (*inbytesleft == 0)
581         {
582             r = *inbuf - inbuf0;
583             break;
584         }
585         
586         x = (cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
587                               &no_read);
588         if (no_read == 0)
589         {
590             r = (size_t)(-1);
591             break;
592         }
593         if (x)
594         {
595             r = (cd->write_handle)(cd, x, outbuf, outbytesleft);
596             if (r)
597                 break;
598         }
599         *inbytesleft -= no_read;
600         (*inbuf) += no_read;
601     }
602     return r;
603 }
604
605 int yaz_iconv_error (yaz_iconv_t cd)
606 {
607     return cd->my_errno;
608 }
609
610 int yaz_iconv_close (yaz_iconv_t cd)
611 {
612 #if HAVE_ICONV_H
613     if (cd->iconv_cd)
614         iconv_close (cd->iconv_cd);
615 #endif
616     xfree (cd);
617     return 0;
618 }
619
620