Handle G1 in MARC-8 decoding.
[yaz-moved-to-github.git] / src / siconv.c
1 /*
2  * Copyright (C) 1995-2008, Index Data ApS
3  * See the file LICENSE for details.
4  *
5  * $Id: siconv.c,v 1.50 2008-03-12 08:53:28 adam Exp $
6  */
7 /**
8  * \file siconv.c
9  * \brief Implements simple ICONV
10  *
11  * This implements an interface similar to that of iconv and
12  * is used by YAZ to interface with iconv (if present).
13  * For systems where iconv is not present, this layer
14  * provides a few important conversions: UTF-8, MARC-8, Latin-1.
15  *
16  * MARC-8 reference:
17  *  http://www.loc.gov/marc/specifications/speccharmarc8.html
18  */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <assert.h>
25 #include <errno.h>
26 #include <string.h>
27 #include <ctype.h>
28 #if HAVE_WCHAR_H
29 #include <wchar.h>
30 #endif
31
32 #if HAVE_ICONV_H
33 #include <iconv.h>
34 #endif
35
36 #include <yaz/xmalloc.h>
37 #include <yaz/nmem.h>
38 #include "iconv-p.h"
39
/* Common signature of the MARC-8 code table converters declared below.
   In this file they are called with a pointer to the input bytes, the number
   of bytes left, an out-parameter receiving the number of bytes consumed,
   an out-parameter flagging a combining character, and a mask/boffset pair
   (127/0, 127/128 or 255/0 at the call sites here).
   The return value is the converted character; 0 means no match.
   NOTE(review): the exact meaning of mask and boffset is defined by the
   generated converters, which are not part of this file - confirm there. */
40 typedef unsigned long yaz_conv_func_t(unsigned char *inp, size_t inbytesleft,
41                                       size_t *no_read, int *combining,
42                                       unsigned mask, int boffset);
43
44
/* MARC-8 to Unicode converters, used by yaz_read_marc8_comb. The hex part
   of each name is the final character of the escape sequence selecting
   that set (0x42 'B', 0x45 'E', ...). */
45 yaz_conv_func_t yaz_marc8_42_conv;
46 yaz_conv_func_t yaz_marc8_45_conv;
47 yaz_conv_func_t yaz_marc8_67_conv;
48 yaz_conv_func_t yaz_marc8_62_conv;
49 yaz_conv_func_t yaz_marc8_70_conv;
50 yaz_conv_func_t yaz_marc8_32_conv;
51 yaz_conv_func_t yaz_marc8_4E_conv;
52 yaz_conv_func_t yaz_marc8_51_conv;
53 yaz_conv_func_t yaz_marc8_33_conv;
54 yaz_conv_func_t yaz_marc8_34_conv;
55 yaz_conv_func_t yaz_marc8_53_conv;
56 yaz_conv_func_t yaz_marc8_31_conv;
57
/* Reverse converters (UTF-8 bytes to MARC-8 code), used by lookup_marc8. */
58 yaz_conv_func_t yaz_marc8r_42_conv;
59 yaz_conv_func_t yaz_marc8r_45_conv;
60 yaz_conv_func_t yaz_marc8r_67_conv;
61 yaz_conv_func_t yaz_marc8r_62_conv;
62 yaz_conv_func_t yaz_marc8r_70_conv;
63 yaz_conv_func_t yaz_marc8r_32_conv;
64 yaz_conv_func_t yaz_marc8r_4E_conv;
65 yaz_conv_func_t yaz_marc8r_51_conv;
66 yaz_conv_func_t yaz_marc8r_33_conv;
67 yaz_conv_func_t yaz_marc8r_34_conv;
68 yaz_conv_func_t yaz_marc8r_53_conv;
69 yaz_conv_func_t yaz_marc8r_31_conv;
70
/* State of one conversion descriptor (the object a yaz_iconv_t points to).
   Holds the selected decoder/encoder callbacks plus the per-conversion
   state of the built-in MARC-8 and ISO-8859-1 codecs. */
71 struct yaz_iconv_struct {
72     int my_errno; /* last error, one of the YAZ_ICONV_* codes */
73     int init_flag; /* set by yaz_iconv_open; while set, yaz_iconv resets the state below */
74     size_t (*init_handle)(yaz_iconv_t cd, unsigned char *inbuf,
75                           size_t inbytesleft, size_t *no_read); /* optional start-of-input hook (set for UTF-8 input) */
76     unsigned long (*read_handle)(yaz_iconv_t cd, unsigned char *inbuf,
77                                  size_t inbytesleft, size_t *no_read); /* decoder: returns one character, reports bytes consumed */
78     size_t (*write_handle)(yaz_iconv_t cd, unsigned long x,
79                            char **outbuf, size_t *outbytesleft); /* encoder: writes one character */
80     size_t (*flush_handle)(yaz_iconv_t cd,
81                            char **outbuf, size_t *outbytesleft); /* optional: writes out characters the encoder holds back */
82     int g0_mode; /* MARC-8 decoding: final escape character of the current G0 set */
83     int g1_mode; /* MARC-8 decoding: final escape character of the current G1 set */
84
85     int comb_offset; /* index of the next cached combining character to hand out */
86     int comb_size; /* number of cached combining characters in comb_x */
87     unsigned long comb_x[8]; /* combining characters read ahead by yaz_read_marc8 */
88     size_t comb_no_read[8]; /* input bytes consumed by each entry of comb_x */
89     size_t no_read_x; /* reset to 0 by yaz_iconv; NOTE(review): use not visible in this part of the file */
90     unsigned long unget_x; /* reset to 0 by yaz_iconv; NOTE(review): use not visible in this part of the file */
91 #if HAVE_ICONV_H
92     iconv_t iconv_cd; /* system iconv descriptor; 0 when the built-in handlers are used */
93 #endif
94     unsigned long compose_char; /* ISO-8859-1 encoder: character held back for possible composition */
95
96     unsigned write_marc8_second_half_char; /* MARC-8 encoder: second-half byte to append after the pending character */
97     unsigned long write_marc8_last; /* MARC-8 encoder: pending non-combining character, 0 if none */
98     const char *write_marc8_lpage; /* MARC-8 encoder: page escape sequence for write_marc8_last */
99     const char *write_marc8_g0; /* MARC-8 encoder: escape sequence of the G0 page currently selected */
100     const char *write_marc8_g1; /* MARC-8 encoder: escape sequence of the G1 page currently selected, 0 if none */
101 };
102
103
/**
 * Latin-1 composition table.
 *
 * Each entry maps a base character (x1) followed by a Unicode combining
 * character (x2) to the single precomposed ISO-8859-1 character (y).
 * The table ends with an all-zero entry; callers iterate until x1 is 0.
 * It is only ever read, hence const.
 */
static const struct {
    unsigned long x1, x2;
    unsigned y;
} latin1_comb[] = {
    { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
    { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
    { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
    { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
    { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
    /* omitted:    0xc6      LATIN CAPITAL LETTER AE */
    { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
    { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
    { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
    { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
    { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
    { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
    { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
    { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
    { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
    { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
    { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
    { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
    { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
    { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
    { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
    /* omitted:    0xd7      MULTIPLICATION SIGN */
    /* omitted:    0xd8      LATIN CAPITAL LETTER O WITH STROKE */
    { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
    { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
    { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
    { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
    { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
    /* omitted:    0xde      LATIN CAPITAL LETTER THORN */
    /* omitted:    0xdf      LATIN SMALL LETTER SHARP S */
    { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
    { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
    { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
    { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
    { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
    { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
    /* omitted:    0xe6      LATIN SMALL LETTER AE */
    { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
    { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
    { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
    { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
    { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
    { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
    { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
    { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
    { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
    /* omitted:    0xf0      LATIN SMALL LETTER ETH */
    { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
    { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
    { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
    { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
    { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
    { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
    /* omitted:    0xf7      DIVISION SIGN */
    /* omitted:    0xf8      LATIN SMALL LETTER O WITH STROKE */
    { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
    { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
    { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
    { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
    { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
    /* omitted:    0xfe      LATIN SMALL LETTER THORN */
    { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */

    { 0, 0, 0}
};
174
175 #define ESC "\033"
176
177 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
178                                        char **outbuf, size_t *outbytesleft,
179                                        const char *page_chr);
180
181 static unsigned long yaz_read_ISO8859_1(yaz_iconv_t cd, unsigned char *inp,
182                                         size_t inbytesleft, size_t *no_read)
183 {
184     unsigned long x = inp[0];
185     *no_read = 1;
186     return x;
187 }
188
189
190
191 #if HAVE_WCHAR_H
192 static unsigned long yaz_read_wchar_t (yaz_iconv_t cd, unsigned char *inp,
193                                        size_t inbytesleft, size_t *no_read)
194 {
195     unsigned long x = 0;
196     
197     if (inbytesleft < sizeof(wchar_t))
198     {
199         cd->my_errno = YAZ_ICONV_EINVAL; /* incomplete input */
200         *no_read = 0;
201     }
202     else
203     {
204         wchar_t wch;
205         memcpy (&wch, inp, sizeof(wch));
206         x = wch;
207         *no_read = sizeof(wch);
208     }
209     return x;
210 }
211 #endif
212
213
214 static unsigned long yaz_read_marc8_comb (yaz_iconv_t cd, unsigned char *inp,
215                                           size_t inbytesleft, size_t *no_read,
216                                           int *comb);
217
218 static unsigned long yaz_read_marc8 (yaz_iconv_t cd, unsigned char *inp,
219                                      size_t inbytesleft, size_t *no_read)
220 {
221     unsigned long x;
222     if (cd->comb_offset < cd->comb_size)
223     {
224         *no_read = cd->comb_no_read[cd->comb_offset];
225         x = cd->comb_x[cd->comb_offset];
226
227         /* special case for double-diacritic combining characters, 
228            INVERTED BREVE and DOUBLE TILDE.
229            We'll increment the no_read counter by 1, since we want to skip over
230            the processing of the closing ligature character
231         */
232         /* this code is no longer necessary.. our handlers code in
233            yaz_marc8_?_conv (generated by charconv.tcl) now returns
234            0 and no_read=1 when a sequence does not match the input.
235            The SECOND HALFs in codetables.xml produces a non-existant
236            entry in the conversion trie.. Hence when met, the input byte is
237            skipped as it should (in yaz_iconv)
238         */
239 #if 0
240         if (x == 0x0361 || x == 0x0360)
241             *no_read += 1;
242 #endif
243         cd->comb_offset++;
244         return x;
245     }
246
247     cd->comb_offset = 0;
248     for (cd->comb_size = 0; cd->comb_size < 8; cd->comb_size++)
249     {
250         int comb = 0;
251
252         if (inbytesleft == 0 && cd->comb_size)
253         {
254             cd->my_errno = YAZ_ICONV_EINVAL;
255             x = 0;
256             *no_read = 0;
257             break;
258         }
259         x = yaz_read_marc8_comb(cd, inp, inbytesleft, no_read, &comb);
260         if (!comb || !x)
261             break;
262         cd->comb_x[cd->comb_size] = x;
263         cd->comb_no_read[cd->comb_size] = *no_read;
264         inp += *no_read;
265         inbytesleft = inbytesleft - *no_read;
266     }
267     return x;
268 }
269
270 static unsigned long yaz_read_marc8s(yaz_iconv_t cd, unsigned char *inp,
271                                      size_t inbytesleft, size_t *no_read)
272 {
273     unsigned long x = yaz_read_marc8(cd, inp, inbytesleft, no_read);
274     if (x && cd->comb_size == 1)
275     {
276         /* For MARC8s we try to get a Latin-1 page code out of it */
277         int i;
278         for (i = 0; latin1_comb[i].x1; i++)
279             if (cd->comb_x[0] == latin1_comb[i].x2 && x == latin1_comb[i].x1)
280             {
281                 *no_read += cd->comb_no_read[0];
282                 cd->comb_size = 0;
283                 x = latin1_comb[i].y;
284                 break;
285             }
286     }
287     return x;
288 }
289
290 static unsigned long yaz_read_marc8_comb(yaz_iconv_t cd, unsigned char *inp,
291                                          size_t inbytesleft, size_t *no_read,
292                                          int *comb)
293 {
294     *no_read = 0;
295     while (inbytesleft > 0 && *inp == 27)
296     {
297         int *modep = &cd->g0_mode;
298         size_t inbytesleft0 = inbytesleft;
299
300         inbytesleft--;
301         inp++;
302         if (inbytesleft == 0)
303             goto incomplete;
304         if (*inp == '$') /* set with multiple bytes */
305         {
306             inbytesleft--;
307             inp++;
308         }
309         if (inbytesleft == 0)
310             goto incomplete;
311         if (*inp == '(' || *inp == ',')  /* G0 */
312         {
313             inbytesleft--;
314             inp++;
315         }
316         else if (*inp == ')' || *inp == '-') /* G1 */
317         {
318             inbytesleft--;
319             inp++;
320             modep = &cd->g1_mode;
321         }
322         if (inbytesleft == 0)
323             goto incomplete;
324         if (*inp == '!') /* ANSEL is a special case */
325         {
326             inbytesleft--;
327             inp++;
328         }
329         if (inbytesleft == 0)
330             goto incomplete;
331         *modep = *inp++; /* Final character */
332         inbytesleft--;
333
334         (*no_read) += inbytesleft0 - inbytesleft;
335     }
336     if (inbytesleft == 0)
337         return 0;
338     else if (*inp == ' ')
339     {
340         *no_read += 1;
341         return ' ';
342     }
343     else
344     {
345         unsigned long x;
346         size_t no_read_sub = 0;
347         int mode = *inp < 128 ? cd->g0_mode : cd->g1_mode;
348         *comb = 0;
349
350         switch(mode)
351         {
352         case 'B':  /* Basic ASCII */
353         case 's':  /* ASCII */
354             x = yaz_marc8_42_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
355             break;
356         case 'E':  /* ANSEL */
357             x = yaz_marc8_45_conv(inp, inbytesleft, &no_read_sub, comb, 127, 128);
358             break;
359         case 'g':  /* Greek */
360             x = yaz_marc8_67_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
361             break;
362         case 'b':  /* Subscripts */
363             x = yaz_marc8_62_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
364             break;
365         case 'p':  /* Superscripts */
366             x = yaz_marc8_70_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
367             break;
368         case '2':  /* Basic Hebrew */
369             x = yaz_marc8_32_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
370             break;
371         case 'N':  /* Basic Cyrillic */
372             x = yaz_marc8_4E_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
373             break;
374         case 'Q':  /* Extended Cyrillic */
375             x = yaz_marc8_51_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
376             break;
377         case '3':  /* Basic Arabic */
378             x = yaz_marc8_33_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
379             break;
380         case '4':  /* Extended Arabic */
381             x = yaz_marc8_34_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
382             break;
383         case 'S':  /* Greek */
384             x = yaz_marc8_53_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
385             break;
386         case '1':  /* Chinese, Japanese, Korean (EACC) */
387             x = yaz_marc8_31_conv(inp, inbytesleft, &no_read_sub, comb, 127, 0);
388             break;
389         default:
390             *no_read = 0;
391             cd->my_errno = YAZ_ICONV_EILSEQ;
392             return 0;
393         }
394         *no_read += no_read_sub;
395         return x;
396     }
397 incomplete:
398     *no_read = 0;
399     cd->my_errno = YAZ_ICONV_EINVAL;
400     return 0;
401 }
402
403 static size_t yaz_write_ISO8859_1(yaz_iconv_t cd, unsigned long x,
404                                   char **outbuf, size_t *outbytesleft)
405 {
406     /* list of two char unicode sequence that, when combined, are
407        equivalent to single unicode chars that can be represented in
408        ISO-8859-1/Latin-1.
409        Regular iconv on Linux at least does not seem to convert these,
410        but since MARC-8 to UTF-8 generates these composed sequence
411        we get a better chance of a successful MARC-8 -> ISO-8859-1
412        conversion */
413     unsigned char *outp = (unsigned char *) *outbuf;
414
415     if (cd->compose_char)
416     {
417         int i;
418         for (i = 0; latin1_comb[i].x1; i++)
419             if (cd->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
420             {
421                 x = latin1_comb[i].y;
422                 break;
423             }
424         if (*outbytesleft < 1)
425         {  /* no room. Retain compose_char and bail out */
426             cd->my_errno = YAZ_ICONV_E2BIG;
427             return (size_t)(-1);
428         }
429         if (!latin1_comb[i].x1) 
430         {   /* not found. Just write compose_char */
431             *outp++ = (unsigned char) cd->compose_char;
432             (*outbytesleft)--;
433             *outbuf = (char *) outp;
434         }
435         /* compose_char used so reset it. x now holds current char */
436         cd->compose_char = 0;
437     }
438
439     if (x > 32 && x < 127 && cd->compose_char == 0)
440     {
441         cd->compose_char = x;
442         return 0;
443     }
444     else if (x > 255 || x < 1)
445     {
446         cd->my_errno = YAZ_ICONV_EILSEQ;
447         return (size_t) -1;
448     }
449     else if (*outbytesleft < 1)
450     {
451         cd->my_errno = YAZ_ICONV_E2BIG;
452         return (size_t)(-1);
453     }
454     *outp++ = (unsigned char) x;
455     (*outbytesleft)--;
456     *outbuf = (char *) outp;
457     return 0;
458 }
459
460 static size_t yaz_flush_ISO8859_1(yaz_iconv_t cd,
461                                   char **outbuf, size_t *outbytesleft)
462 {
463     if (cd->compose_char)
464     {
465         unsigned char *outp = (unsigned char *) *outbuf;
466         if (*outbytesleft < 1)
467         {
468             cd->my_errno = YAZ_ICONV_E2BIG;
469             return (size_t)(-1);
470         }
471         *outp++ = (unsigned char) cd->compose_char;
472         (*outbytesleft)--;
473         *outbuf = (char *) outp;
474         cd->compose_char = 0;
475     }
476     return 0;
477 }
478
479 static unsigned long lookup_marc8(yaz_iconv_t cd,
480                                   unsigned long x, int *comb,
481                                   const char **page_chr)
482 {
483     char utf8_buf[7];
484     char *utf8_outbuf = utf8_buf;
485     size_t utf8_outbytesleft = sizeof(utf8_buf)-1, r;
486
487     r = yaz_write_UTF8(cd, x, &utf8_outbuf, &utf8_outbytesleft);
488     if (r == (size_t)(-1))
489     {
490         cd->my_errno = YAZ_ICONV_EILSEQ;
491         return 0;
492     }
493     else
494     {
495         unsigned char *inp;
496         size_t inbytesleft, no_read_sub = 0;
497         unsigned long x;
498
499         *utf8_outbuf = '\0';        
500         inp = (unsigned char *) utf8_buf;
501         inbytesleft = strlen(utf8_buf);
502
503         x = yaz_marc8r_42_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
504         if (x)
505         {
506             *page_chr = ESC "(B";
507             return x;
508         }
509         x = yaz_marc8r_45_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
510         if (x)
511         {
512             *page_chr = ESC "(B";
513             return x;
514         }
515         x = yaz_marc8r_62_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
516         if (x)
517         {
518             *page_chr = ESC "b";
519             return x;
520         }
521         x = yaz_marc8r_70_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
522         if (x)
523         {
524             *page_chr = ESC "p";
525             return x;
526         }
527         x = yaz_marc8r_32_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
528         if (x)
529         {
530             *page_chr = ESC "(2";
531             return x;
532         }
533         x = yaz_marc8r_4E_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
534         if (x)
535         {
536             *page_chr = ESC "(N";
537             return x;
538         }
539         x = yaz_marc8r_51_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
540         if (x)
541         {
542             *page_chr = ESC "(Q";
543             return x;
544         }
545         x = yaz_marc8r_33_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
546         if (x)
547         {
548             *page_chr = ESC "(3";
549             return x;
550         }
551         x = yaz_marc8r_34_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
552         if (x)
553         {
554             *page_chr = ESC "(4";
555             return x;
556         }
557         x = yaz_marc8r_53_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
558         if (x)
559         {
560             *page_chr = ESC "(S";
561             return x;
562         }
563         x = yaz_marc8r_31_conv(inp, inbytesleft, &no_read_sub, comb, 255, 0);
564         if (x)
565         {
566             *page_chr = ESC "$1";
567             return x;
568         }
569         cd->my_errno = YAZ_ICONV_EILSEQ;
570         return x;
571     }
572 }
573
574 static size_t flush_combos(yaz_iconv_t cd,
575                            char **outbuf, size_t *outbytesleft)
576 {
577     unsigned long y = cd->write_marc8_last;
578     unsigned char byte;
579     char out_buf[4];
580     size_t out_no = 0;
581
582     if (!y)
583         return 0;
584
585     assert(cd->write_marc8_lpage);
586     if (cd->write_marc8_lpage)
587     {
588         size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
589                                             cd->write_marc8_lpage);
590         if (r)
591             return r;
592     }
593
594     byte = (unsigned char )((y>>16) & 0xff);
595     if (byte)
596         out_buf[out_no++] = byte;
597     byte = (unsigned char)((y>>8) & 0xff);
598     if (byte)
599         out_buf[out_no++] = byte;
600     byte = (unsigned char )(y & 0xff);
601     if (byte)
602         out_buf[out_no++] = byte;
603
604     if (out_no + 2 >= *outbytesleft)
605     {
606         cd->my_errno = YAZ_ICONV_E2BIG;
607         return (size_t) (-1);
608     }
609
610     memcpy(*outbuf, out_buf, out_no);
611     *outbuf += out_no;
612     (*outbytesleft) -= out_no;
613     if (cd->write_marc8_second_half_char)
614     {
615         *(*outbuf)++ = cd->write_marc8_second_half_char;
616         (*outbytesleft)--;
617     }        
618
619     cd->write_marc8_last = 0;
620     cd->write_marc8_lpage = 0;
621     cd->write_marc8_second_half_char = 0;
622     return 0;
623 }
624
625 static size_t yaz_write_marc8_page_chr(yaz_iconv_t cd, 
626                                        char **outbuf, size_t *outbytesleft,
627                                        const char *page_chr)
628 {
629     const char **old_page_chr = &cd->write_marc8_g0;
630
631     /* are we going to a G1-set (such as such as ESC ")!E") */
632     if (page_chr && page_chr[1] == ')')
633         old_page_chr = &cd->write_marc8_g1;
634
635     if (!*old_page_chr || strcmp(page_chr, *old_page_chr))
636     {
637         size_t plen = 0;
638         const char *page_out = page_chr;
639         
640         if (*outbytesleft < 8)
641         {
642             cd->my_errno = YAZ_ICONV_E2BIG;
643             
644             return (size_t) (-1);
645         }
646
647         if (*old_page_chr)
648         {
649             if (!strcmp(*old_page_chr, ESC "p") 
650                 || !strcmp(*old_page_chr, ESC "g")
651                 || !strcmp(*old_page_chr, ESC "b"))
652             {
653                 page_out = ESC "s";
654                 /* Technique 1 leave */
655                 if (strcmp(page_chr, ESC "(B")) /* Not going ASCII page? */
656                 {
657                     /* Must leave script + enter new page */
658                     plen = strlen(page_out);
659                     memcpy(*outbuf, page_out, plen);
660                     (*outbuf) += plen;
661                     (*outbytesleft) -= plen;
662                     page_out = ESC "(B";
663                 }
664             }
665         }
666         *old_page_chr = page_chr;
667         plen = strlen(page_out);
668         memcpy(*outbuf, page_out, plen);
669         (*outbuf) += plen;
670         (*outbytesleft) -= plen;
671     }
672     return 0;
673 }
674
675
676 static size_t yaz_write_marc8_2(yaz_iconv_t cd, unsigned long x,
677                                 char **outbuf, size_t *outbytesleft)
678 {
679     int comb = 0;
680     const char *page_chr = 0;
681     unsigned long y = lookup_marc8(cd, x, &comb, &page_chr);
682
683     if (!y)
684         return (size_t) (-1);
685
686     if (comb)
687     {
688         if (page_chr)
689         {
690             size_t r = yaz_write_marc8_page_chr(cd, outbuf, outbytesleft,
691                                                 page_chr);
692             if (r)
693                 return r;
694         }
695         if (x == 0x0361)
696             cd->write_marc8_second_half_char = 0xEC;
697         else if (x == 0x0360)
698             cd->write_marc8_second_half_char = 0xFB;
699
700         if (*outbytesleft <= 1)
701         {
702             cd->my_errno = YAZ_ICONV_E2BIG;
703             return (size_t) (-1);
704         }
705         *(*outbuf)++ = y;
706         (*outbytesleft)--;
707     }
708     else
709     {
710         size_t r = flush_combos(cd, outbuf, outbytesleft);
711         if (r)
712             return r;
713
714         cd->write_marc8_last = y;
715         cd->write_marc8_lpage = page_chr;
716     }
717     return 0;
718 }
719
720 static size_t yaz_flush_marc8(yaz_iconv_t cd,
721                               char **outbuf, size_t *outbytesleft)
722 {
723     size_t r = flush_combos(cd, outbuf, outbytesleft);
724     if (r)
725         return r;
726     cd->write_marc8_g1 = 0;
727     return yaz_write_marc8_page_chr(cd, outbuf, outbytesleft, ESC "(B");
728 }
729
730 static size_t yaz_write_marc8(yaz_iconv_t cd, unsigned long x,
731                               char **outbuf, size_t *outbytesleft)
732 {
733     int i;
734     for (i = 0; latin1_comb[i].x1; i++)
735     {
736         if (x == latin1_comb[i].y)
737         {
738             size_t r ;
739             /* save the output pointers .. */
740             char *outbuf0 = *outbuf;
741             size_t outbytesleft0 = *outbytesleft;
742             int last_ch = cd->write_marc8_last;
743             const char *lpage = cd->write_marc8_lpage;
744
745             r = yaz_write_marc8_2(cd, latin1_comb[i].x1,
746                                   outbuf, outbytesleft);
747             if (r)
748                 return r;
749             r = yaz_write_marc8_2(cd, latin1_comb[i].x2,
750                                   outbuf, outbytesleft);
751             if (r && cd->my_errno == YAZ_ICONV_E2BIG)
752             {
753                 /* not enough room. reset output to original values */
754                 *outbuf = outbuf0;
755                 *outbytesleft = outbytesleft0;
756                 cd->write_marc8_last = last_ch;
757                 cd->write_marc8_lpage = lpage;
758             }
759             return r;
760         }
761     }
762     return yaz_write_marc8_2(cd, x, outbuf, outbytesleft);
763 }
764
765
766 #if HAVE_WCHAR_H
767 static size_t yaz_write_wchar_t(yaz_iconv_t cd, unsigned long x,
768                                 char **outbuf, size_t *outbytesleft)
769 {
770     unsigned char *outp = (unsigned char *) *outbuf;
771
772     if (*outbytesleft >= sizeof(wchar_t))
773     {
774         wchar_t wch = x;
775         memcpy(outp, &wch, sizeof(wch));
776         outp += sizeof(wch);
777         (*outbytesleft) -= sizeof(wch);
778     }
779     else
780     {
781         cd->my_errno = YAZ_ICONV_E2BIG;
782         return (size_t)(-1);
783     }
784     *outbuf = (char *) outp;
785     return 0;
786 }
787 #endif
788
789 int yaz_iconv_isbuiltin(yaz_iconv_t cd)
790 {
791     return cd->read_handle && cd->write_handle;
792 }
793
794 yaz_iconv_t yaz_iconv_open (const char *tocode, const char *fromcode)
795 {
796     yaz_iconv_t cd = (yaz_iconv_t) xmalloc (sizeof(*cd));
797
798     cd->write_handle = 0;
799     cd->read_handle = 0;
800     cd->init_handle = 0;
801     cd->flush_handle = 0;
802     cd->my_errno = YAZ_ICONV_UNKNOWN;
803
804     /* a useful hack: if fromcode has leading @,
805        the library not use YAZ's own conversions .. */
806     if (fromcode[0] == '@')
807         fromcode++;
808     else
809     {
810         if (!yaz_matchstr(fromcode, "UTF8"))
811         {
812             cd->read_handle = yaz_read_UTF8;
813             cd->init_handle = yaz_init_UTF8;
814         }
815         else if (!yaz_matchstr(fromcode, "ISO88591"))
816             cd->read_handle = yaz_read_ISO8859_1;
817         else if (!yaz_matchstr(fromcode, "UCS4"))
818             cd->read_handle = yaz_read_UCS4;
819         else if (!yaz_matchstr(fromcode, "UCS4LE"))
820             cd->read_handle = yaz_read_UCS4LE;
821         else if (!yaz_matchstr(fromcode, "MARC8"))
822             cd->read_handle = yaz_read_marc8;
823         else if (!yaz_matchstr(fromcode, "MARC8s"))
824             cd->read_handle = yaz_read_marc8s;
825         else if (!yaz_matchstr(fromcode, "advancegreek"))
826             cd->read_handle = yaz_read_advancegreek;
827         else if (!yaz_matchstr(fromcode, "iso54281984"))
828             cd->read_handle = yaz_read_iso5428_1984;
829         else if (!yaz_matchstr(fromcode, "iso5428:1984"))
830             cd->read_handle = yaz_read_iso5428_1984;
831 #if HAVE_WCHAR_H
832         else if (!yaz_matchstr(fromcode, "WCHAR_T"))
833             cd->read_handle = yaz_read_wchar_t;
834 #endif
835         
836         if (!yaz_matchstr(tocode, "UTF8"))
837             cd->write_handle = yaz_write_UTF8;
838         else if (!yaz_matchstr(tocode, "ISO88591"))
839         {
840             cd->write_handle = yaz_write_ISO8859_1;
841             cd->flush_handle = yaz_flush_ISO8859_1;
842         }
843         else if (!yaz_matchstr (tocode, "UCS4"))
844             cd->write_handle = yaz_write_UCS4;
845         else if (!yaz_matchstr(tocode, "UCS4LE"))
846             cd->write_handle = yaz_write_UCS4LE;
847         else if (!yaz_matchstr(tocode, "MARC8"))
848         {
849             cd->write_handle = yaz_write_marc8;
850             cd->flush_handle = yaz_flush_marc8;
851         }
852         else if (!yaz_matchstr(tocode, "MARC8s"))
853         {
854             cd->write_handle = yaz_write_marc8;
855             cd->flush_handle = yaz_flush_marc8;
856         }
857         else if (!yaz_matchstr(tocode, "advancegreek"))
858         {
859             cd->write_handle = yaz_write_advancegreek;
860         }
861         else if (!yaz_matchstr(tocode, "iso54281984"))
862         {
863             cd->write_handle = yaz_write_iso5428_1984;
864         }
865         else if (!yaz_matchstr(tocode, "iso5428:1984"))
866         {
867             cd->write_handle = yaz_write_iso5428_1984;
868         }
869 #if HAVE_WCHAR_H
870         else if (!yaz_matchstr(tocode, "WCHAR_T"))
871             cd->write_handle = yaz_write_wchar_t;
872 #endif
873     }
874 #if HAVE_ICONV_H
875     cd->iconv_cd = 0;
876     if (!cd->read_handle || !cd->write_handle)
877     {
878         cd->iconv_cd = iconv_open (tocode, fromcode);
879         if (cd->iconv_cd == (iconv_t) (-1))
880         {
881             xfree (cd);
882             return 0;
883         }
884     }
885 #else
886     if (!cd->read_handle || !cd->write_handle)
887     {
888         xfree (cd);
889         return 0;
890     }
891 #endif
892     cd->init_flag = 1;
893     return cd;
894 }
895
896 size_t yaz_iconv(yaz_iconv_t cd, char **inbuf, size_t *inbytesleft,
897                  char **outbuf, size_t *outbytesleft)
898 {
899     char *inbuf0 = 0;
900     size_t r = 0;
901
902 #if HAVE_ICONV_H
903     if (cd->iconv_cd)
904     {
905         size_t r =
906             iconv(cd->iconv_cd, inbuf, inbytesleft, outbuf, outbytesleft);
907         if (r == (size_t)(-1))
908         {
909             switch (yaz_errno())
910             {
911             case E2BIG:
912                 cd->my_errno = YAZ_ICONV_E2BIG;
913                 break;
914             case EINVAL:
915                 cd->my_errno = YAZ_ICONV_EINVAL;
916                 break;
917             case EILSEQ:
918                 cd->my_errno = YAZ_ICONV_EILSEQ;
919                 break;
920             default:
921                 cd->my_errno = YAZ_ICONV_UNKNOWN;
922             }
923         }
924         return r;
925     }
926 #endif
927
928     if (inbuf)
929         inbuf0 = *inbuf;
930
931     if (cd->init_flag)
932     {
933         cd->my_errno = YAZ_ICONV_UNKNOWN;
934         cd->g0_mode = 'B';
935         cd->g1_mode = 'E';
936         
937         cd->comb_offset = cd->comb_size = 0;
938         cd->compose_char = 0;
939         
940         cd->write_marc8_second_half_char = 0;
941         cd->write_marc8_last = 0;
942         cd->write_marc8_lpage = 0;
943         cd->write_marc8_g0 = ESC "(B";
944         cd->write_marc8_g1 = 0;
945         
946         cd->unget_x = 0;
947         cd->no_read_x = 0;
948     }
949
950     if (cd->init_flag)
951     {
952         if (cd->init_handle && inbuf && *inbuf)
953         {
954             size_t no_read = 0;
955             size_t r = (cd->init_handle)(cd, (unsigned char *) *inbuf,
956                                          *inbytesleft, &no_read);
957             if (r)
958             {
959                 if (cd->my_errno == YAZ_ICONV_EINVAL)
960                     return r;
961                 cd->init_flag = 0;
962                 return r;
963             }
964             *inbytesleft -= no_read;
965             *inbuf += no_read;
966         }
967     }
968     cd->init_flag = 0;
969
970     if (!inbuf || !*inbuf)
971     {
972         if (outbuf && *outbuf)
973         {
974             if (cd->unget_x)
975                 r = (*cd->write_handle)(cd, cd->unget_x, outbuf, outbytesleft);
976             if (cd->flush_handle)
977                 r = (*cd->flush_handle)(cd, outbuf, outbytesleft);
978         }
979         if (r == 0)
980             cd->init_flag = 1;
981         cd->unget_x = 0;
982         return r;
983     }
984     while (1)
985     {
986         unsigned long x;
987         size_t no_read;
988
989         if (cd->unget_x)
990         {
991             x = cd->unget_x;
992             no_read = cd->no_read_x;
993         }
994         else
995         {
996             if (*inbytesleft == 0)
997             {
998                 r = *inbuf - inbuf0;
999                 break;
1000             }
1001             x = (*cd->read_handle)(cd, (unsigned char *) *inbuf, *inbytesleft,
1002                                    &no_read);
1003             if (no_read == 0)
1004             {
1005                 r = (size_t)(-1);
1006                 break;
1007             }
1008         }
1009         if (x)
1010         {
1011             r = (*cd->write_handle)(cd, x, outbuf, outbytesleft);
1012             if (r)
1013             {
1014                 /* unable to write it. save it because read_handle cannot
1015                    rewind .. */
1016                 if (cd->my_errno == YAZ_ICONV_E2BIG)
1017                 {
1018                     cd->unget_x = x;
1019                     cd->no_read_x = no_read;
1020                     break;
1021                 }
1022             }
1023             cd->unget_x = 0;
1024         }
1025         *inbytesleft -= no_read;
1026         (*inbuf) += no_read;
1027     }
1028     return r;
1029 }
1030
1031 int yaz_iconv_error (yaz_iconv_t cd)
1032 {
1033     return cd->my_errno;
1034 }
1035
1036 int yaz_iconv_close (yaz_iconv_t cd)
1037 {
1038 #if HAVE_ICONV_H
1039     if (cd->iconv_cd)
1040         iconv_close (cd->iconv_cd);
1041 #endif
1042     xfree (cd);
1043     return 0;
1044 }
1045
1046 void yaz_iconv_set_errno(yaz_iconv_t cd, int no)
1047 {
1048     cd->my_errno = no;
1049 }
1050
1051 /*
1052  * Local variables:
1053  * c-basic-offset: 4
1054  * indent-tabs-mode: nil
1055  * End:
1056  * vim: shiftwidth=4 tabstop=8 expandtab
1057  */