b6bcf0e466d31a5b09926e85173bdcfd4228110a
[yaz-moved-to-github.git] / src / siconv.c
1 /* This file is part of the YAZ toolkit.
2  * Copyright (C) 1995-2008 Index Data
3  * See the file LICENSE for details.
4  */
5 /**
6  * \file
7  * \brief Implements simple ICONV
8  *
9  * This implements an interface similar to that of iconv and
10  * is used by YAZ to interface with iconv (if present).
11  * For systems where iconv is not present, this layer
12  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
13  *
14  */
15
16 #if HAVE_CONFIG_H
17 #include <config.h>
18 #endif
19
20 #include <assert.h>
21 #include <errno.h>
22 #include <string.h>
23 #include <ctype.h>
24 #if HAVE_WCHAR_H
25 #include <wchar.h>
26 #endif
27
28 #if HAVE_ICONV_H
29 #include <iconv.h>
30 #endif
31
32 #include <yaz/xmalloc.h>
33 #include <yaz/nmem.h>
34 #include "iconv-p.h"
35
36 yaz_conv_func_t yaz_marc8_42_conv;
37 yaz_conv_func_t yaz_marc8_45_conv;
38 yaz_conv_func_t yaz_marc8_67_conv;
39 yaz_conv_func_t yaz_marc8_62_conv;
40 yaz_conv_func_t yaz_marc8_70_conv;
41 yaz_conv_func_t yaz_marc8_32_conv;
42 yaz_conv_func_t yaz_marc8_4E_conv;
43 yaz_conv_func_t yaz_marc8_51_conv;
44 yaz_conv_func_t yaz_marc8_33_conv;
45 yaz_conv_func_t yaz_marc8_34_conv;
46 yaz_conv_func_t yaz_marc8_53_conv;
47 yaz_conv_func_t yaz_marc8_31_conv;
48
49 struct yaz_iconv_struct {
50     int my_errno;
51     int init_flag;
52     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
53                             size_t inbytesleft, size_t *no_read);
54     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
55                                  size_t inbytesleft, size_t *no_read);
56     int g0_mode;
57     int g1_mode;
58
59     int comb_offset;
60     int comb_size;
61     unsigned long comb_x[8];
62     size_t comb_no_read[8];
63     size_t no_read_x;
64     unsigned long unget_x;
65 #if HAVE_ICONV_H
66     iconv_t iconv_cd;
67 #endif
68     struct yaz_iconv_encoder_s encoder;
69 };
70
71
72 static unsigned long yaz_read_ISO8859_1(yaz_iconv_t cd, unsigned char *inp,
73                                         size_t inbytesleft, size_t *no_read)
74 {
75     unsigned long x = inp[0];
76     *no_read = 1;
77     return x;
78 }
79
#if HAVE_WCHAR_H
/**
 * \brief Decoder for input stored as native wchar_t values.
 *
 * Copies one wchar_t out of the input and returns it.  When fewer than
 * sizeof(wchar_t) bytes remain, sets YAZ_ICONV_EINVAL on \a cd, reports
 * zero bytes consumed and returns 0.
 */
static unsigned long yaz_read_wchar_t(yaz_iconv_t cd, unsigned char *inp,
                                      size_t inbytesleft, size_t *no_read)
{
    wchar_t wide;

    if (inbytesleft < sizeof(wide))
    {
        /* incomplete input */
        cd->my_errno = YAZ_ICONV_EINVAL;
        *no_read = 0;
        return 0;
    }
    /* memcpy avoids assuming that inp is aligned for wchar_t */
    memcpy(&wide, inp, sizeof(wide));
    *no_read = sizeof(wide);
    return wide;
}
#endif
101
102
103 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
104                                          size_t inbytesleft, size_t *no_read,
105                                          int *comb);
106
107 static unsigned long yaz_read_marc8(yaz_iconv_t cd, unsigned char *inp,
108                                     size_t inbytesleft, size_t *no_read)
109 {
110     unsigned long x;
111     if (cd->comb_offset < cd->comb_size)
112     {
113         *no_read = cd->comb_no_read[cd->comb_offset];
114         x = cd->comb_x[cd->comb_offset];
115
116         /* special case for double-diacritic combining characters, 
117            INVERTED BREVE and DOUBLE TILDE.
118            We'll increment the no_read counter by 1, since we want to skip over
119            the processing of the closing ligature character
120         */
121         /* this code is no longer necessary.. our handlers code in
122            yaz_marc8_?_conv (generated by charconv.tcl) now returns
123            0 and no_read=1 when a sequence does not match the input.
124            The SECOND HALFs in codetables.xml produces a non-existant
125            entry in the conversion trie.. Hence when met, the input byte is
126            skipped as it should (in yaz_iconv)
127         */
128 #if 0
129         if (x == 0x0361 || x == 0x0360)
130             *no_read += 1;
131 #endif
132         cd->comb_offset++;
133         return x;
134     }
135
136     cd->comb_offset = 0;
137     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
138     {
139         int comb = 0;
140
141         if (inbytesleft == 0 && cd->comb_size)
142         {
143             cd->my_errno = YAZ_ICONV_EINVAL;
144             x = 0;
145             *no_read = 0;
146             break;
147         }
148         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
149         if (!comb || !x)
150             break;
151         cd->comb_x[cd->comb_size] = x;
152         cd->comb_no_read[cd->comb_size] = *no_read;
153         inp += *no_read;
154         inbytesleft = inbytesleft - *no_read;
155     }
156     return x;
157 }
158
159 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
160                                      size_t inbytesleft, size_t *no_read)
161 {
162     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
163     if (x && cd->comb_size == 1)
164     {
165         if (yaz_iso_8859_1_lookup_x12(x, cd->comb_x[0], &x))
166         {
167             *no_read += cd->comb_no_read[0];
168             cd->comb_size = 0;
169         }
170     }
171     return x;
172 }
173
174 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
175                                          size_t inbytesleft, size_t *no_read,
176                                          int *comb)
177 {
178     *no_read = 0;
179     while (inbytesleft > 0 && *inp == 27)
180     {
181         int *modep = &cd->g0_mode;
182         size_t inbytesleft0 = inbytesleft;
183
184         inbytesleft--;
185         inp++;
186         if (inbytesleft == 0)
187             goto incomplete;
188         if (*inp == '$') /* set with multiple bytes */
189         {
190             inbytesleft--;
191             inp++;
192         }
193         if (inbytesleft == 0)
194             goto incomplete;
195         if (*inp == '(' || *inp == ',')  /* G0 */
196         {
197             inbytesleft--;
198             inp++;
199         }
200         else if (*inp == ')' || *inp == '-') /* G1 */
201         {
202             inbytesleft--;
203             inp++;
204             modep = &cd->g1_mode;
205         }
206         if (inbytesleft == 0)
207             goto incomplete;
208         if (*inp == '!') /* ANSEL is a special case */
209         {
210             inbytesleft--;
211             inp++;
212         }
213         if (inbytesleft == 0)
214             goto incomplete;
215         *modep = *inp++; /* Final character */
216         inbytesleft--;
217
218         (*no_read) += inbytesleft0 - inbytesleft;
219     }
220     if (inbytesleft == 0)
221         return 0;
222     else if (*inp == ' ')
223     {
224         *no_read += 1;
225         return ' ';
226     }
227     else
228     {
229         unsigned long x;
230         size_t no_read_sub = 0;
231         int mode = *inp < 128 ? cd->g0_mode : cd->g1_mode;
232         *comb = 0;
233
234         switch(mode)
235         {
236         case 'B':  /* Basic ASCII */
237         case 's':  /* ASCII */
238             x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
239             break;
240         case 'E':  /* ANSEL */
241             x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
242             break;
243         case 'g':  /* Greek */
244             x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
245             break;
246         case 'b':  /* Subscripts */
247             x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
248             break;
249         case 'p':  /* Superscripts */
250             x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
251             break;
252         case '2':  /* Basic Hebrew */
253             x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
254             break;
255         case 'N':  /* Basic Cyrillic */
256             x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
257             break;
258         case 'Q':  /* Extended Cyrillic */
259             x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
260             break;
261         case '3':  /* Basic Arabic */
262             x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
263             break;
264         case '4':  /* Extended Arabic */
265             x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
266             break;
267         case 'S':  /* Greek */
268             x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
269             break;
270         case '1':  /* Chinese, Japanese, Korean (EACC) */
271             x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
272             break;
273         default:
274             *no_read = 0;
275             cd->my_errno = YAZ_ICONV_EILSEQ;
276             return 0;
277         }
278         *no_read += no_read_sub;
279         return x;
280     }
281 incomplete:
282     *no_read = 0;
283     cd->my_errno = YAZ_ICONV_EINVAL;
284     return 0;
285 }
286
287
288
289 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
290 {
291     return cd->read_handle && cd->encoder.write_handle;
292 }
293
294
295 static int prepare_encoders(yaz_iconv_t cd, const char *tocode)
296 {
297     if (yaz_marc8_encoder(tocode, &cd->encoder))
298         return 1;
299     if (yaz_utf8_encoder(tocode, &cd->encoder))
300         return 1;
301     if (yaz_ucs4_encoder(tocode, &cd->encoder))
302         return 1;
303     if (yaz_iso_8859_1_encoder(tocode, &cd->encoder))
304         return 1;
305     if (yaz_iso_5428_encoder(tocode, &cd->encoder))
306         return 1;
307     if (yaz_advancegreek_encoder(tocode, &cd->encoder))
308         return 1;
309     if (yaz_wchar_encoder(tocode, &cd->encoder))
310         return 1;
311     return 0;
312 }
313
314 yaz_iconv_t yaz_iconv_open(const char *tocode, const char *fromcode)
315 {
316     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
317
318     cd->encoder.data = 0;
319     cd->encoder.write_handle = 0;
320     cd->encoder.flush_handle = 0;
321     cd->encoder.init_handle = 0;
322     cd->encoder.destroy_handle = 0;
323
324     cd->read_handle = 0;
325     cd->init_handle = 0;
326     cd->my_errno = YAZ_ICONV_UNKNOWN;
327
328     /* a useful hack: if fromcode has leading @,
329        the library not use YAZ's own conversions .. */
330     if (fromcode[0] == '@')
331         fromcode++;
332     else
333     {
334         if (!yaz_matchstr(fromcode, "UTF8"))
335         {
336             cd->read_handle = yaz_read_UTF8;
337             cd->init_handle = yaz_init_UTF8;
338         }
339         else if (!yaz_matchstr(fromcode, "ISO88591"))
340             cd->read_handle = yaz_read_ISO8859_1;
341         else if (!yaz_matchstr(fromcode, "UCS4"))
342             cd->read_handle = yaz_read_UCS4;
343         else if (!yaz_matchstr(fromcode, "UCS4LE"))
344             cd->read_handle = yaz_read_UCS4LE;
345         else if (!yaz_matchstr(fromcode, "MARC8"))
346             cd->read_handle = yaz_read_marc8;
347         else if (!yaz_matchstr(fromcode, "MARC8s"))
348             cd->read_handle = yaz_read_marc8s;
349         else if (!yaz_matchstr(fromcode, "advancegreek"))
350             cd->read_handle = yaz_read_advancegreek;
351         else if (!yaz_matchstr(fromcode, "iso54281984"))
352             cd->read_handle = yaz_read_iso5428_1984;
353         else if (!yaz_matchstr(fromcode, "iso5428:1984"))
354             cd->read_handle = yaz_read_iso5428_1984;
355 #if HAVE_WCHAR_H
356         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
357             cd->read_handle = yaz_read_wchar_t;
358 #endif
359         prepare_encoders(cd, tocode);
360     }
361     if (cd->read_handle && cd->encoder.write_handle)
362     {
363 #if HAVE_ICONV_H
364         cd->iconv_cd = 0;
365 #endif
366         ;
367     }
368     else
369     {
370 #if HAVE_ICONV_H
371         cd->iconv_cd = iconv_open(tocode, fromcode);
372         if (cd->iconv_cd == (iconv_t) (-1))
373         {
374             yaz_iconv_close(cd);
375             return 0;
376         }
377 #else
378         yaz_iconv_close(cd);
379         return 0;
380 #endif
381     }
382     cd->init_flag = 1;
383     return cd;
384 }
385
386 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
387                  char **outbuf, size_t *outbytesleft)
388 {
389     char *inbuf0 = 0;
390     size_t r = 0;
391
392 #if HAVE_ICONV_H
393     if (cd->iconv_cd)
394     {
395         size_t r =
396             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
397         if (r == (size_t)(-1))
398         {
399             switch (yaz_errno())
400             {
401             case E2BIG:
402                 cd->my_errno = YAZ_ICONV_E2BIG;
403                 break;
404             case EINVAL:
405                 cd->my_errno = YAZ_ICONV_EINVAL;
406                 break;
407             case EILSEQ:
408                 cd->my_errno = YAZ_ICONV_EILSEQ;
409                 break;
410             default:
411                 cd->my_errno = YAZ_ICONV_UNKNOWN;
412             }
413         }
414         return r;
415     }
416 #endif
417
418     if (inbuf)
419         inbuf0 = *inbuf;
420
421     if (cd->init_flag)
422     {
423         cd->my_errno = YAZ_ICONV_UNKNOWN;
424         cd->g0_mode = 'B';
425         cd->g1_mode = 'E';
426         
427         cd->comb_offset = cd->comb_size = 0;
428
429         if (cd->encoder.init_handle)
430             (*cd->encoder.init_handle)(&cd->encoder);
431         
432         cd->unget_x = 0;
433         cd->no_read_x = 0;
434
435         if (cd->init_handle && inbuf && *inbuf)
436         {
437             size_t no_read = 0;
438             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
439                                          *inbytesleft, &no_read);
440             if (r)
441             {
442                 if (cd->my_errno == YAZ_ICONV_EINVAL)
443                     return r;
444                 cd->init_flag = 0;
445                 return r;
446             }
447             *inbytesleft -= no_read;
448             *inbuf += no_read;
449         }
450     }
451     cd->init_flag = 0;
452
453     if (!inbuf || !*inbuf)
454     {
455         if (outbuf && *outbuf)
456         {
457             if (cd->unget_x)
458                 r = (*cd->encoder.write_handle)(cd, &cd->encoder,
459                                                 cd->unget_x, outbuf, outbytesleft);
460             if (cd->encoder.flush_handle)
461                 r = (*cd->encoder.flush_handle)(cd, &cd->encoder,
462                                                 outbuf, outbytesleft);
463         }
464         if (r == 0)
465             cd->init_flag = 1;
466         cd->unget_x = 0;
467         return r;
468     }
469     while (1)
470     {
471         unsigned long x;
472         size_t no_read;
473
474         if (cd->unget_x)
475         {
476             x = cd->unget_x;
477             no_read = cd->no_read_x;
478         }
479         else
480         {
481             if (*inbytesleft == 0)
482             {
483                 r = *inbuf - inbuf0;
484                 break;
485             }
486             x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
487                                    &no_read);
488             if (no_read == 0)
489             {
490                 r = (size_t)(-1);
491                 break;
492             }
493         }
494         if (x)
495         {
496             r = (*cd->encoder.write_handle)(cd, &cd->encoder,
497                                             x, outbuf, outbytesleft);
498             if (r)
499             {
500                 /* unable to write it. save it because read_handle cannot
501                    rewind .. */
502                 if (cd->my_errno == YAZ_ICONV_E2BIG)
503                 {
504                     cd->unget_x = x;
505                     cd->no_read_x = no_read;
506                     break;
507                 }
508             }
509             cd->unget_x = 0;
510         }
511         *inbytesleft -= no_read;
512         (*inbuf) += no_read;
513     }
514     return r;
515 }
516
517 int yaz_iconv_error(yaz_iconv_t cd)
518 {
519     return cd->my_errno;
520 }
521
522 int yaz_iconv_close(yaz_iconv_t cd)
523 {
524 #if HAVE_ICONV_H
525     if (cd->iconv_cd)
526         iconv_close(cd->iconv_cd);
527 #endif
528     if (cd->encoder.destroy_handle)
529         (*cd->encoder.destroy_handle)(&cd->encoder);
530     xfree(cd);
531     return 0;
532 }
533
534 void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
535 {
536     cd->my_errno = no;
537 }
538
539 /*
540  * Local variables:
541  * c-basic-offset: 4
542  * indent-tabs-mode: nil
543  * End:
544  * vim: shiftwidth=4 tabstop=8 expandtab
545  */